mirror of https://github.com/Bunsly/JobSpy
Google sheets integration (#22)
parent
d10dce6913
commit
65bfcb14d4
|
@ -63,11 +63,13 @@ jobs:
|
||||||
|
|
||||||
- name: Check error field in response
|
- name: Check error field in response
|
||||||
run: |
|
run: |
|
||||||
|
global_error=$(jq '.error' response.json)
|
||||||
indeed_error=$(jq '.indeed.error' response.json)
|
indeed_error=$(jq '.indeed.error' response.json)
|
||||||
linkedin_error=$(jq '.linkedin.error' response.json)
|
linkedin_error=$(jq '.linkedin.error' response.json)
|
||||||
|
|
||||||
if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then
|
if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then
|
||||||
echo "Error found in response:"
|
echo "Error found in response:"
|
||||||
|
echo "Global Error: $global_error"
|
||||||
echo "Indeed Error: $indeed_error"
|
echo "Indeed Error: $indeed_error"
|
||||||
echo "LinkedIn Error: $linkedin_error"
|
echo "LinkedIn Error: $linkedin_error"
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
@ -4,4 +4,5 @@
|
||||||
/ven/
|
/ven/
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
.env
|
.env
|
||||||
|
client_secret.json
|
40
README.md
40
README.md
|
@ -4,8 +4,10 @@
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
|
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
|
||||||
- Returns jobs as JSON or CSV with title, location, company, description & other data
|
- Returns jobs as JSON or CSV with title, location, company, description & other data
|
||||||
|
- Imports directly into **Google Sheets**
|
||||||
- Optional JWT authorization
|
- Optional JWT authorization
|
||||||
|
|
||||||
|
![jobspy_gsheet](https://github.com/cullenwatson/JobSpy/assets/78247585/9f0a997c-4e33-4167-b04e-31ab1f606edb)
|
||||||
|
|
||||||
### API
|
### API
|
||||||
|
|
||||||
|
@ -23,7 +25,7 @@ Optional
|
||||||
├── is_remote (bool)
|
├── is_remote (bool)
|
||||||
├── results_wanted (int): per site_type
|
├── results_wanted (int): per site_type
|
||||||
├── easy_apply (bool): only for linkedin
|
├── easy_apply (bool): only for linkedin
|
||||||
└── output_format (enum): json, csv
|
└── output_format (enum): json, csv, gsheet
|
||||||
```
|
```
|
||||||
|
|
||||||
### Request Example
|
### Request Example
|
||||||
|
@ -34,6 +36,7 @@ Optional
|
||||||
"distance": 10,
|
"distance": 10,
|
||||||
"job_type": "fulltime",
|
"job_type": "fulltime",
|
||||||
"results_wanted": 15
|
"results_wanted": 15,
|
||||||
|
"output_format": "gsheet"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Response Schema
|
### Response Schema
|
||||||
|
@ -63,7 +66,16 @@ JobResponse
|
||||||
├── total_results (int)
|
├── total_results (int)
|
||||||
└── returned_results (int)
|
└── returned_results (int)
|
||||||
```
|
```
|
||||||
|
### Response Example (GOOGLE SHEETS)
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "Successfully uploaded to Google Sheets",
|
||||||
|
"error": null,
|
||||||
|
"linkedin": null,
|
||||||
|
"indeed": null,
|
||||||
|
"zip_recruiter": null
|
||||||
|
}
|
||||||
|
```
|
||||||
### Response Example (JSON)
|
### Response Example (JSON)
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
@ -132,15 +144,33 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
|
||||||
4. Run the server with `uvicorn main:app --reload`
|
4. Run the server with `uvicorn main:app --reload`
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
### Google Sheets Integration (Optional)
|
||||||
|
|
||||||
### Swagger UI:
|
#### Obtaining an Access Key: [Video Guide](https://www.youtube.com/watch?v=w533wJuilao)
|
||||||
To interact with the API documentation, navigate to [localhost:8000/docs](http://localhost:8000/docs).
|
* Enable the [Google Sheets & Google Drive API](https://console.cloud.google.com/)
|
||||||
|
* Create credentials -> service account -> create & continue
|
||||||
|
* Select role -> basic: editor -> done
|
||||||
|
* Click on the email you just created in the service account list
|
||||||
|
* Go to the Keys tab -> add key -> create new key -> JSON -> Create
|
||||||
|
|
||||||
### Postman:
|
#### Using the key in the repo
|
||||||
|
* Copy the key file into the JobSpy repo as `/client_secret.json`
|
||||||
|
* Go to [my template sheet](https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing) & save as a copy into your account
|
||||||
|
* Share the sheet with the email from the service account above with editor rights
|
||||||
|
* If you changed the name of the sheet, put the name in `GSHEET_NAME` in `/settings.py`
|
||||||
|
|
||||||
|
### How to call the API
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#### [Postman](https://www.postman.com/downloads/) (preferred):
|
||||||
To use Postman:
|
To use Postman:
|
||||||
1. Locate the files in the `/postman/` directory.
|
1. Locate the files in the `/postman/` directory.
|
||||||
2. Import the Postman collection and environment JSON files.
|
2. Import the Postman collection and environment JSON files.
|
||||||
|
|
||||||
|
#### Swagger UI:
|
||||||
|
Or you can call the API with the interactive documentation at [localhost:8000/docs](http://localhost:8000/docs).
|
||||||
|
|
||||||
## FAQ
|
## FAQ
|
||||||
|
|
||||||
### I'm having issues with my queries. What should I do?
|
### I'm having issues with my queries. What should I do?
|
||||||
|
|
|
@ -4,11 +4,9 @@ from jose import jwt, JWTError
|
||||||
from fastapi import HTTPException, status, Depends
|
from fastapi import HTTPException, status, Depends
|
||||||
from fastapi.security import OAuth2PasswordBearer
|
from fastapi.security import OAuth2PasswordBearer
|
||||||
|
|
||||||
from settings import *
|
|
||||||
from api.core.users import TokenData
|
from api.core.users import TokenData
|
||||||
from api.auth.db_utils import UserInDB, get_user
|
from api.auth.db_utils import UserInDB, get_user
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")
|
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,3 +4,4 @@ from enum import Enum
|
||||||
class OutputFormat(Enum):
|
class OutputFormat(Enum):
|
||||||
CSV = "csv"
|
CSV = "csv"
|
||||||
JSON = "json"
|
JSON = "json"
|
||||||
|
GSHEET = "gsheet"
|
||||||
|
|
|
@ -1,19 +1,50 @@
|
||||||
|
import gspread
|
||||||
|
from oauth2client.service_account import ServiceAccountCredentials
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ...jobs import *
|
from ...jobs import *
|
||||||
from ...scrapers import *
|
from ...scrapers import *
|
||||||
|
from settings import *
|
||||||
|
|
||||||
def generate_filename() -> str:
|
|
||||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
||||||
return f"JobSpy_results_{timestamp}.csv"
|
|
||||||
|
|
||||||
|
|
||||||
class CSVFormatter:
|
class CSVFormatter:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format(jobs: ScraperResponse) -> StringIO:
|
def upload_to_google_sheet(csv_data: str):
|
||||||
|
try:
|
||||||
|
scope = [
|
||||||
|
"https://www.googleapis.com/auth/spreadsheets",
|
||||||
|
"https://www.googleapis.com/auth/drive.file",
|
||||||
|
"https://www.googleapis.com/auth/drive",
|
||||||
|
]
|
||||||
|
credentials = ServiceAccountCredentials.from_json_keyfile_name(
|
||||||
|
GSHEET_JSON_KEY_PATH, scope
|
||||||
|
)
|
||||||
|
gc = gspread.authorize(credentials)
|
||||||
|
sh = gc.open(GSHEET_NAME)
|
||||||
|
|
||||||
|
worksheet = sh.get_worksheet(0)
|
||||||
|
data_string = csv_data.getvalue()
|
||||||
|
reader = csv.reader(StringIO(data_string))
|
||||||
|
|
||||||
|
rows = list(reader)
|
||||||
|
|
||||||
|
for i, row in enumerate(rows):
|
||||||
|
if i == 0:
|
||||||
|
continue
|
||||||
|
worksheet.append_row(row)
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def generate_filename() -> str:
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
return f"JobSpy_results_{timestamp}.csv"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def format(jobs: CommonResponse) -> StringIO:
|
||||||
"""
|
"""
|
||||||
Transform the jobs objects into csv
|
Transform the jobs objects into csv
|
||||||
:param jobs:
|
:param jobs:
|
||||||
|
@ -41,7 +72,7 @@ class CSVFormatter:
|
||||||
writer.writerow(headers)
|
writer.writerow(headers)
|
||||||
|
|
||||||
for site, job_response in jobs.dict().items():
|
for site, job_response in jobs.dict().items():
|
||||||
if job_response and job_response.get("success"):
|
if isinstance(job_response, dict) and job_response.get("success"):
|
||||||
for job in job_response["jobs"]:
|
for job in job_response["jobs"]:
|
||||||
writer.writerow(
|
writer.writerow(
|
||||||
[
|
[
|
||||||
|
|
|
@ -55,12 +55,13 @@ class JobResponse(BaseModel):
|
||||||
success: bool
|
success: bool
|
||||||
error: str = None
|
error: str = None
|
||||||
|
|
||||||
|
total_results: int = None
|
||||||
|
|
||||||
jobs: list[JobPost] = []
|
jobs: list[JobPost] = []
|
||||||
|
|
||||||
total_results: int = None
|
|
||||||
returned_results: int = None
|
returned_results: int = None
|
||||||
|
|
||||||
@validator("returned_results")
|
@validator("returned_results", pre=True, always=True)
|
||||||
def set_returned_results(cls, v, values):
|
def set_returned_results(cls, v, values):
|
||||||
if v is None and values.get("jobs"):
|
if v is None and values.get("jobs"):
|
||||||
return len(values["jobs"])
|
return len(values["jobs"])
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from ..jobs import *
|
from ..jobs import *
|
||||||
from ..formatters import OutputFormat
|
from ..formatters import OutputFormat
|
||||||
from typing import List, Dict, Optional
|
from typing import List, Dict, Optional, Any
|
||||||
|
|
||||||
|
|
||||||
class StatusException(Exception):
|
class StatusException(Exception):
|
||||||
|
@ -28,10 +28,12 @@ class ScraperInput(BaseModel):
|
||||||
results_wanted: int = 15
|
results_wanted: int = 15
|
||||||
|
|
||||||
|
|
||||||
class ScraperResponse(BaseModel):
|
class CommonResponse(BaseModel):
|
||||||
linkedin: Optional[JobResponse]
|
status: Optional[str]
|
||||||
indeed: Optional[JobResponse]
|
error: Optional[str]
|
||||||
zip_recruiter: Optional[JobResponse]
|
linkedin: Optional[Any] = None
|
||||||
|
indeed: Optional[Any] = None
|
||||||
|
zip_recruiter: Optional[Any] = None
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
|
|
|
@ -6,13 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
|
||||||
from api.core.scrapers.indeed import IndeedScraper
|
from api.core.scrapers.indeed import IndeedScraper
|
||||||
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
|
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
|
||||||
from api.core.scrapers.linkedin import LinkedInScraper
|
from api.core.scrapers.linkedin import LinkedInScraper
|
||||||
from api.core.formatters.csv import CSVFormatter, generate_filename
|
from api.core.formatters.csv import CSVFormatter
|
||||||
from api.core.scrapers import (
|
from api.core.scrapers import (
|
||||||
ScraperInput,
|
ScraperInput,
|
||||||
Site,
|
Site,
|
||||||
JobResponse,
|
JobResponse,
|
||||||
OutputFormat,
|
OutputFormat,
|
||||||
ScraperResponse,
|
CommonResponse,
|
||||||
)
|
)
|
||||||
from typing import List, Dict, Tuple, Union
|
from typing import List, Dict, Tuple, Union
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ SCRAPER_MAPPING = {
|
||||||
|
|
||||||
|
|
||||||
@router.post("/")
|
@router.post("/")
|
||||||
async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
|
async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
|
||||||
"""
|
"""
|
||||||
Asynchronously scrapes job data from multiple job sites.
|
Asynchronously scrapes job data from multiple job sites.
|
||||||
:param scraper_input:
|
:param scraper_input:
|
||||||
|
@ -42,14 +42,26 @@ async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
results = dict(executor.map(scrape_site, scraper_input.site_type))
|
results = dict(executor.map(scrape_site, scraper_input.site_type))
|
||||||
|
|
||||||
scraper_response = ScraperResponse(**results)
|
scraper_response = CommonResponse(status="JSON response success", **results)
|
||||||
|
|
||||||
if scraper_input.output_format == OutputFormat.CSV:
|
if scraper_input.output_format == OutputFormat.CSV:
|
||||||
csv_output = CSVFormatter.format(scraper_response)
|
csv_output = CSVFormatter.format(scraper_response)
|
||||||
response = StreamingResponse(csv_output, media_type="text/csv")
|
response = StreamingResponse(csv_output, media_type="text/csv")
|
||||||
response.headers[
|
response.headers[
|
||||||
"Content-Disposition"
|
"Content-Disposition"
|
||||||
] = f"attachment; filename={generate_filename()}"
|
] = f"attachment; filename={CSVFormatter.generate_filename()}"
|
||||||
return response
|
return response
|
||||||
|
|
||||||
return scraper_response
|
elif scraper_input.output_format == OutputFormat.GSHEET:
|
||||||
|
csv_output = CSVFormatter.format(scraper_response)
|
||||||
|
try:
|
||||||
|
CSVFormatter.upload_to_google_sheet(csv_output)
|
||||||
|
return CommonResponse(status="Successfully uploaded to Google Sheets")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return CommonResponse(
|
||||||
|
status="Failed to upload to Google Sheet", error=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
return scraper_response
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,32 +1,43 @@
|
||||||
anyio==3.7.1
|
anyio==3.7.1
|
||||||
|
atomicwrites==1.4.1
|
||||||
attrs==23.1.0
|
attrs==23.1.0
|
||||||
bcrypt==4.0.1
|
bcrypt==4.0.1
|
||||||
beautifulsoup4==4.12.2
|
beautifulsoup4==4.12.2
|
||||||
|
cachetools==5.3.1
|
||||||
certifi==2023.5.7
|
certifi==2023.5.7
|
||||||
cffi==1.15.1
|
cffi==1.15.1
|
||||||
chardet==4.0.0
|
chardet==4.0.0
|
||||||
charset-normalizer==3.2.0
|
charset-normalizer==3.2.0
|
||||||
click==8.1.4
|
click==8.1.4
|
||||||
|
colorama==0.4.6
|
||||||
cryptography==41.0.1
|
cryptography==41.0.1
|
||||||
dataclasses==0.6
|
dataclasses==0.6
|
||||||
deprecation==2.1.0
|
deprecation==2.1.0
|
||||||
ecdsa==0.18.0
|
ecdsa==0.18.0
|
||||||
exceptiongroup==1.1.2
|
exceptiongroup==1.1.2
|
||||||
fastapi==0.99.1
|
fastapi==0.99.1
|
||||||
|
google-auth==2.22.0
|
||||||
|
google-auth-oauthlib==1.0.0
|
||||||
gotrue==0.2.0
|
gotrue==0.2.0
|
||||||
|
gspread==5.10.0
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
httpcore==0.12.3
|
httpcore==0.12.3
|
||||||
|
httplib2==0.22.0
|
||||||
httpx==0.16.1
|
httpx==0.16.1
|
||||||
idna==2.10
|
idna==2.10
|
||||||
iniconfig==2.0.0
|
iniconfig==2.0.0
|
||||||
|
oauth2client==4.1.3
|
||||||
|
oauthlib==3.2.2
|
||||||
packaging==23.1
|
packaging==23.1
|
||||||
passlib==1.7.4
|
passlib==1.7.4
|
||||||
pluggy==1.2.0
|
pluggy==1.2.0
|
||||||
postgrest-py==0.4.0
|
postgrest-py==0.4.0
|
||||||
py==1.11.0
|
py==1.11.0
|
||||||
pyasn1==0.5.0
|
pyasn1==0.5.0
|
||||||
|
pyasn1-modules==0.3.0
|
||||||
pycparser==2.21
|
pycparser==2.21
|
||||||
pydantic==1.10.11
|
pydantic==1.10.11
|
||||||
|
pyparsing==3.1.1
|
||||||
pytest==6.2.5
|
pytest==6.2.5
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
|
@ -34,6 +45,7 @@ python-jose==3.3.0
|
||||||
python-multipart==0.0.6
|
python-multipart==0.0.6
|
||||||
realtime-py==0.1.3
|
realtime-py==0.1.3
|
||||||
requests==2.25.1
|
requests==2.25.1
|
||||||
|
requests-oauthlib==1.3.1
|
||||||
rfc3986==1.5.0
|
rfc3986==1.5.0
|
||||||
rsa==4.9
|
rsa==4.9
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
|
|
|
@ -2,9 +2,14 @@ from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
# gsheets (template to copy at https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing)
|
||||||
|
GSHEET_JSON_KEY_PATH = "client_secret.json"
|
||||||
|
GSHEET_NAME = "JobSpy"
|
||||||
|
|
||||||
|
# optional auth
|
||||||
|
AUTH_REQUIRED = False
|
||||||
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
||||||
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
|
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
|
||||||
JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
|
JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
|
||||||
ALGORITHM = "HS256"
|
|
||||||
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
||||||
AUTH_REQUIRED = False
|
ALGORITHM = "HS256"
|
||||||
|
|
Loading…
Reference in New Issue