Google sheets integration (#22)

pull/23/head^2
Cullen Watson 2023-08-27 20:32:46 -05:00 committed by GitHub
parent d10dce6913
commit 65bfcb14d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 186 additions and 36 deletions

View File

@ -63,11 +63,13 @@ jobs:
- name: Check error field in response - name: Check error field in response
run: | run: |
global_error=$(jq '.error' response.json)
indeed_error=$(jq '.indeed.error' response.json) indeed_error=$(jq '.indeed.error' response.json)
linkedin_error=$(jq '.linkedin.error' response.json) linkedin_error=$(jq '.linkedin.error' response.json)
if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then
echo "Error found in response:" echo "Error found in response:"
echo "Global Error: $global_error"
echo "Indeed Error: $indeed_error" echo "Indeed Error: $indeed_error"
echo "LinkedIn Error: $linkedin_error" echo "LinkedIn Error: $linkedin_error"
exit 1 exit 1

3
.gitignore vendored
View File

@ -4,4 +4,5 @@
/ven/ /ven/
**/__pycache__/ **/__pycache__/
*.pyc *.pyc
.env .env
client_secret.json

View File

@ -4,8 +4,10 @@
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Returns jobs as JSON or CSV with title, location, company, description & other data - Returns jobs as JSON or CSV with title, location, company, description & other data
- Imports directly into **Google Sheets**
- Optional JWT authorization - Optional JWT authorization
![jobspy_gsheet](https://github.com/cullenwatson/JobSpy/assets/78247585/9f0a997c-4e33-4167-b04e-31ab1f606edb)
### API ### API
@ -23,7 +25,7 @@ Optional
├── is_remote (bool) ├── is_remote (bool)
├── results_wanted (int): per site_type ├── results_wanted (int): per site_type
├── easy_apply (bool): only for linkedin ├── easy_apply (bool): only for linkedin
└── output_format (enum): json, csv └── output_format (enum): json, csv, gsheet
``` ```
### Request Example ### Request Example
@ -34,6 +36,7 @@ Optional
"distance": 10, "distance": 10,
"job_type": "fulltime", "job_type": "fulltime",
"results_wanted": 15 "results_wanted": 15,
"output_format": "gsheet"
``` ```
### Response Schema ### Response Schema
@ -63,7 +66,16 @@ JobResponse
├── total_results (int) ├── total_results (int)
└── returned_results (int) └── returned_results (int)
``` ```
### Response Example (GOOGLE SHEETS)
```json
{
"status": "Successfully uploaded to Google Sheets",
"error": null,
"linkedin": null,
"indeed": null,
"zip_recruiter": null
}
```
### Response Example (JSON) ### Response Example (JSON)
```json ```json
{ {
@ -132,15 +144,33 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
4. Run the server with `uvicorn main:app --reload` 4. Run the server with `uvicorn main:app --reload`
## Usage ## Usage
### Google Sheets Integration (Optional)
### Swagger UI: #### Obtaining an Access Key: [Video Guide](https://www.youtube.com/watch?v=w533wJuilao)
To interact with the API documentation, navigate to [localhost:8000/docs](http://localhost:8000/docs). * Enable the [Google Sheets & Google Drive API](https://console.cloud.google.com/)
* Create credentials -> service account -> create & continue
* Select role -> basic: editor -> done
* Click on the email you just created in the service account list
* Go to the Keys tab -> add key -> create new key -> JSON -> Create
### Postman: #### Using the key in the repo
* Copy the key file into the JobSpy repo as `/client_secret.json`
* Go to [my template sheet](https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing) & save as a copy into your account
* Share the sheet with the email from the service account above with editor rights
* If you changed the name of the sheet, put the name in `GSHEET_NAME` in `/settings.py`
### How to call the API
#### [Postman](https://www.postman.com/downloads/) (preferred):
To use Postman: To use Postman:
1. Locate the files in the `/postman/` directory. 1. Locate the files in the `/postman/` directory.
2. Import the Postman collection and environment JSON files. 2. Import the Postman collection and environment JSON files.
#### Swagger UI:
Or you can call the API with the interactive documentation at [localhost:8000/docs](http://localhost:8000/docs).
## FAQ ## FAQ
### I'm having issues with my queries. What should I do? ### I'm having issues with my queries. What should I do?

View File

@ -4,11 +4,9 @@ from jose import jwt, JWTError
from fastapi import HTTPException, status, Depends from fastapi import HTTPException, status, Depends
from fastapi.security import OAuth2PasswordBearer from fastapi.security import OAuth2PasswordBearer
from settings import *
from api.core.users import TokenData from api.core.users import TokenData
from api.auth.db_utils import UserInDB, get_user from api.auth.db_utils import UserInDB, get_user
load_dotenv()
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token") oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")

View File

@ -4,3 +4,4 @@ from enum import Enum
class OutputFormat(Enum): class OutputFormat(Enum):
CSV = "csv" CSV = "csv"
JSON = "json" JSON = "json"
GSHEET = "gsheet"

View File

@ -1,19 +1,50 @@
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import csv import csv
from io import StringIO from io import StringIO
from datetime import datetime from datetime import datetime
from ...jobs import * from ...jobs import *
from ...scrapers import * from ...scrapers import *
from settings import *
def generate_filename() -> str:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
class CSVFormatter: class CSVFormatter:
@staticmethod @staticmethod
def format(jobs: ScraperResponse) -> StringIO: def upload_to_google_sheet(csv_data: str):
try:
scope = [
"https://www.googleapis.com/auth/spreadsheets",
"https://www.googleapis.com/auth/drive.file",
"https://www.googleapis.com/auth/drive",
]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
GSHEET_JSON_KEY_PATH, scope
)
gc = gspread.authorize(credentials)
sh = gc.open(GSHEET_NAME)
worksheet = sh.get_worksheet(0)
data_string = csv_data.getvalue()
reader = csv.reader(StringIO(data_string))
rows = list(reader)
for i, row in enumerate(rows):
if i == 0:
continue
worksheet.append_row(row)
except Exception as e:
raise e
@staticmethod
def generate_filename() -> str:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
@staticmethod
def format(jobs: CommonResponse) -> StringIO:
""" """
Transform the jobs objects into csv Transform the jobs objects into csv
:param jobs: :param jobs:
@ -41,7 +72,7 @@ class CSVFormatter:
writer.writerow(headers) writer.writerow(headers)
for site, job_response in jobs.dict().items(): for site, job_response in jobs.dict().items():
if job_response and job_response.get("success"): if isinstance(job_response, dict) and job_response.get("success"):
for job in job_response["jobs"]: for job in job_response["jobs"]:
writer.writerow( writer.writerow(
[ [

View File

@ -55,12 +55,13 @@ class JobResponse(BaseModel):
success: bool success: bool
error: str = None error: str = None
total_results: int = None
jobs: list[JobPost] = [] jobs: list[JobPost] = []
total_results: int = None
returned_results: int = None returned_results: int = None
@validator("returned_results") @validator("returned_results", pre=True, always=True)
def set_returned_results(cls, v, values): def set_returned_results(cls, v, values):
if v is None and values.get("jobs"): if v is None and values.get("jobs"):
return len(values["jobs"]) return len(values["jobs"])

View File

@ -1,6 +1,6 @@
from ..jobs import * from ..jobs import *
from ..formatters import OutputFormat from ..formatters import OutputFormat
from typing import List, Dict, Optional from typing import List, Dict, Optional, Any
class StatusException(Exception): class StatusException(Exception):
@ -28,10 +28,12 @@ class ScraperInput(BaseModel):
results_wanted: int = 15 results_wanted: int = 15
class ScraperResponse(BaseModel): class CommonResponse(BaseModel):
linkedin: Optional[JobResponse] status: Optional[str]
indeed: Optional[JobResponse] error: Optional[str]
zip_recruiter: Optional[JobResponse] linkedin: Optional[Any] = None
indeed: Optional[Any] = None
zip_recruiter: Optional[Any] = None
class Scraper: class Scraper:

View File

@ -6,13 +6,13 @@ from concurrent.futures import ThreadPoolExecutor
from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper from api.core.scrapers.linkedin import LinkedInScraper
from api.core.formatters.csv import CSVFormatter, generate_filename from api.core.formatters.csv import CSVFormatter
from api.core.scrapers import ( from api.core.scrapers import (
ScraperInput, ScraperInput,
Site, Site,
JobResponse, JobResponse,
OutputFormat, OutputFormat,
ScraperResponse, CommonResponse,
) )
from typing import List, Dict, Tuple, Union from typing import List, Dict, Tuple, Union
@ -26,7 +26,7 @@ SCRAPER_MAPPING = {
@router.post("/") @router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse: async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
""" """
Asynchronously scrapes job data from multiple job sites. Asynchronously scrapes job data from multiple job sites.
:param scraper_input: :param scraper_input:
@ -42,14 +42,26 @@ async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
results = dict(executor.map(scrape_site, scraper_input.site_type)) results = dict(executor.map(scrape_site, scraper_input.site_type))
scraper_response = ScraperResponse(**results) scraper_response = CommonResponse(status="JSON response success", **results)
if scraper_input.output_format == OutputFormat.CSV: if scraper_input.output_format == OutputFormat.CSV:
csv_output = CSVFormatter.format(scraper_response) csv_output = CSVFormatter.format(scraper_response)
response = StreamingResponse(csv_output, media_type="text/csv") response = StreamingResponse(csv_output, media_type="text/csv")
response.headers[ response.headers[
"Content-Disposition" "Content-Disposition"
] = f"attachment; filename={generate_filename()}" ] = f"attachment; filename={CSVFormatter.generate_filename()}"
return response return response
return scraper_response elif scraper_input.output_format == OutputFormat.GSHEET:
csv_output = CSVFormatter.format(scraper_response)
try:
CSVFormatter.upload_to_google_sheet(csv_output)
return CommonResponse(status="Successfully uploaded to Google Sheets")
except Exception as e:
return CommonResponse(
status="Failed to upload to Google Sheet", error=str(e)
)
else:
return scraper_response

File diff suppressed because one or more lines are too long

View File

@ -1,32 +1,43 @@
anyio==3.7.1 anyio==3.7.1
atomicwrites==1.4.1
attrs==23.1.0 attrs==23.1.0
bcrypt==4.0.1 bcrypt==4.0.1
beautifulsoup4==4.12.2 beautifulsoup4==4.12.2
cachetools==5.3.1
certifi==2023.5.7 certifi==2023.5.7
cffi==1.15.1 cffi==1.15.1
chardet==4.0.0 chardet==4.0.0
charset-normalizer==3.2.0 charset-normalizer==3.2.0
click==8.1.4 click==8.1.4
colorama==0.4.6
cryptography==41.0.1 cryptography==41.0.1
dataclasses==0.6 dataclasses==0.6
deprecation==2.1.0 deprecation==2.1.0
ecdsa==0.18.0 ecdsa==0.18.0
exceptiongroup==1.1.2 exceptiongroup==1.1.2
fastapi==0.99.1 fastapi==0.99.1
google-auth==2.22.0
google-auth-oauthlib==1.0.0
gotrue==0.2.0 gotrue==0.2.0
gspread==5.10.0
h11==0.14.0 h11==0.14.0
httpcore==0.12.3 httpcore==0.12.3
httplib2==0.22.0
httpx==0.16.1 httpx==0.16.1
idna==2.10 idna==2.10
iniconfig==2.0.0 iniconfig==2.0.0
oauth2client==4.1.3
oauthlib==3.2.2
packaging==23.1 packaging==23.1
passlib==1.7.4 passlib==1.7.4
pluggy==1.2.0 pluggy==1.2.0
postgrest-py==0.4.0 postgrest-py==0.4.0
py==1.11.0 py==1.11.0
pyasn1==0.5.0 pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21 pycparser==2.21
pydantic==1.10.11 pydantic==1.10.11
pyparsing==3.1.1
pytest==6.2.5 pytest==6.2.5
python-dateutil==2.8.2 python-dateutil==2.8.2
python-dotenv==1.0.0 python-dotenv==1.0.0
@ -34,6 +45,7 @@ python-jose==3.3.0
python-multipart==0.0.6 python-multipart==0.0.6
realtime-py==0.1.3 realtime-py==0.1.3
requests==2.25.1 requests==2.25.1
requests-oauthlib==1.3.1
rfc3986==1.5.0 rfc3986==1.5.0
rsa==4.9 rsa==4.9
six==1.16.0 six==1.16.0

View File

@ -2,9 +2,14 @@ from dotenv import load_dotenv
import os import os
load_dotenv() load_dotenv()
# gsheets (template to copy at https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing)
GSHEET_JSON_KEY_PATH = "client_secret.json"
GSHEET_NAME = "JobSpy"
# optional auth
AUTH_REQUIRED = False
SUPABASE_URL = os.environ.get("SUPABASE_URL") SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY") SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY") JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 60 ACCESS_TOKEN_EXPIRE_MINUTES = 60
AUTH_REQUIRED = False ALGORITHM = "HS256"