From 7d54b98cafc5f7a62b18704b462766efe8d65d87 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 9 Jul 2023 15:15:39 -0500 Subject: [PATCH 01/18] feat: add auth to jobs endpoint --- .gitignore | 1 + api/__init__.py | 5 +++- api/core/users/__init__.py | 20 +++++++++++++++ api/v1/__init__.py | 2 ++ api/v1/jobs/__init__.py | 5 ++-- api/v1/token/__init__.py | 23 ++++++++++++++++++ api/v1/token/auth.py | 50 ++++++++++++++++++++++++++++++++++++++ api/v1/token/db_utils.py | 37 ++++++++++++++++++++++++++++ main.py | 14 +++++++---- requirements.txt | 8 ++++-- settings.py | 9 +++++++ 11 files changed, 164 insertions(+), 10 deletions(-) create mode 100644 api/core/users/__init__.py create mode 100644 api/v1/token/__init__.py create mode 100644 api/v1/token/auth.py create mode 100644 api/v1/token/db_utils.py create mode 100644 settings.py diff --git a/.gitignore b/.gitignore index d895cdb..98e943c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /ven/ **/__pycache__/ *.pyc +.env \ No newline at end of file diff --git a/api/__init__.py b/api/__init__.py index 3479834..02838e3 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -1,5 +1,8 @@ from fastapi import APIRouter from .v1 import router as v1_router -router = APIRouter(prefix="/api", tags=["api"]) +router = APIRouter( + prefix="/api", + tags=["api"], +) router.include_router(v1_router) diff --git a/api/core/users/__init__.py b/api/core/users/__init__.py new file mode 100644 index 0000000..5b33b94 --- /dev/null +++ b/api/core/users/__init__.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel + +class User(BaseModel): + username: str + full_name: str + email: str + disabled: bool + + +class UserInDB(User): + hashed_password: str + + +class TokenData(BaseModel): + username: str + + +class Token(BaseModel): + access_token: str + token_type: str diff --git a/api/v1/__init__.py b/api/v1/__init__.py index ea42d12..33f258d 100644 --- a/api/v1/__init__.py +++ b/api/v1/__init__.py @@ -1,5 +1,7 @@ from fastapi import APIRouter from .jobs import router as jobs_router +from .token import router as token_router router = APIRouter(prefix="/v1") router.include_router(jobs_router) +router.include_router(token_router) diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index 2a87cc6..5ba1ec9 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -1,11 +1,12 @@ -from fastapi import APIRouter +from fastapi import APIRouter, Depends from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper from api.core.scrapers.linkedin import LinkedInScraper from api.core.scrapers import ScraperInput, Site +from ...v1.token.auth import get_active_current_user -router = APIRouter(prefix="/jobs") +router = APIRouter(prefix="/jobs", dependencies=[Depends(get_active_current_user)]) SCRAPER_MAPPING = { Site.LINKEDIN: LinkedInScraper, diff --git a/api/v1/token/__init__.py b/api/v1/token/__init__.py new file mode 100644 index 0000000..e3eebf3 --- /dev/null +++ b/api/v1/token/__init__.py @@ -0,0 +1,23 @@ +from fastapi import APIRouter, Depends, HTTPException, status +from fastapi.security import OAuth2PasswordRequestForm + +from api.core.users import Token +from .db_utils import authenticate_user +from .auth import create_access_token + +ACCESS_TOKEN_EXPIRE_MINUTES = 30 +router = APIRouter(prefix="/token") + + +@router.post("/", response_model=Token) +async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): + user = authenticate_user(form_data.username, 
form_data.password) + if user is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Incorrect username or password", + headers={"WWW-Authenticate": "Bearer"}, + ) + + access_token = create_access_token(data={"sub": user.username}) + return {"access_token": access_token, "token_type": "bearer"} diff --git a/api/v1/token/auth.py b/api/v1/token/auth.py new file mode 100644 index 0000000..8cf6fd6 --- /dev/null +++ b/api/v1/token/auth.py @@ -0,0 +1,50 @@ +from datetime import datetime, timedelta + +from jose import jwt, JWTError +from fastapi import HTTPException, status, Depends +from fastapi.security import OAuth2PasswordBearer + +from settings import * +from api.core.users import TokenData +from .db_utils import UserInDB, get_user + +load_dotenv() +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/token") + + +def create_access_token(data: dict): + print(JWT_SECRET_KEY) + to_encode = data.copy() + expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) + to_encode.update({"exp": expire}) + encoded_jwt = jwt.encode(to_encode, JWT_SECRET_KEY, algorithm=ALGORITHM) + return encoded_jwt + + +async def get_current_user(token: str = Depends(oauth2_scheme)): + credential_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + try: + payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[ALGORITHM]) + username: str = payload.get("sub") + if username is None: + raise credential_exception + token_data = TokenData(username=username) + except JWTError: + raise credential_exception + + current_user = get_user(token_data.username) + if current_user is None: + raise credential_exception + return current_user + + +async def get_active_current_user(current_user: UserInDB = Depends(get_current_user)): + if current_user.disabled: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, detail="Inactive user." 
+ ) + return current_user diff --git a/api/v1/token/db_utils.py b/api/v1/token/db_utils.py new file mode 100644 index 0000000..04fe8af --- /dev/null +++ b/api/v1/token/db_utils.py @@ -0,0 +1,37 @@ +from passlib.context import CryptContext + +from supabase_py import create_client, Client +from api.core.users import UserInDB +from settings import SUPABASE_URL, SUPABASE_KEY + +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY) + +def get_user(username: str): + result = supabase.table('users').select().eq('username', username).execute() + + if 'error' in result and result['error']: + print(f"Error: {result['error']['message']}") + return None + else: + if result['data']: + user_data = result['data'][0] # get the first (and should be only) user with the matching username + return UserInDB(**user_data) + else: + return None + +def verify_password(password: str, hashed_password: str): + return pwd_context.verify(password, hashed_password) + + +def get_password_hash(password): + return pwd_context.hash(password) + + +def authenticate_user(username: str, password: str): + user = get_user(username) + if not user: + return False + if not verify_password(password, user.hashed_password): + return False + return user diff --git a/main.py b/main.py index 2028fb9..f87c3d1 100644 --- a/main.py +++ b/main.py @@ -1,11 +1,15 @@ from fastapi import FastAPI +from supabase_py import create_client, Client from api import router as api_router -app = FastAPI() +app = FastAPI( + title="JobSpy Backend", + description="Endpoints for job board scrapers", + version="1.0.0", +) app.include_router(api_router) - -@app.get("/") -async def root(): - return {"message": "JobSpy Backend"} +@app.get("/", tags=["health"]) +async def health_check(): + return {"message": "JobSpy ready to scrape"} diff --git a/requirements.txt b/requirements.txt index dddfd55..b48c7e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,11 @@ fastapi~=0.99.1 pydantic~=1.10.11 beautifulsoup4~=4.12.2 -requests~=2.31.0 +requests pip~=21.3.1 wheel~=0.37.1 -setuptools~=60.2.0 \ No newline at end of file +setuptools~=60.2.0 +passlib~=1.7.4 +cryptography~=41.0.1 +python-jose~=3.3.0 +python-dotenv~=1.0.0 \ No newline at end of file diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..39b21eb --- /dev/null +++ b/settings.py @@ -0,0 +1,9 @@ +from dotenv import load_dotenv +import os + +load_dotenv() +SUPABASE_URL = os.environ.get("SUPABASE_URL") +SUPABASE_KEY = os.environ.get("SUPABASE_KEY") +JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY") +ALGORITHM = "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = 30 From adf4cbae3117c412f1eaa6c5c5ef44d75f8dc5ad Mon Sep 17 00:00:00 2001 From: Cullen Date: Sun, 9 Jul 2023 18:42:44 -0500 Subject: [PATCH 02/18] feat(users): add register route --- api/__init__.py | 2 + api/auth/__init__.py | 8 +++ api/{v1/token/auth.py => auth/auth_utils.py} | 4 +- api/{v1/token => auth}/db_utils.py | 28 +++++++-- api/auth/register/__init__.py | 28 +++++++++ api/{v1 => auth}/token/__init__.py | 9 ++- api/core/users/__init__.py | 10 +++- api/v1/__init__.py | 7 ++- api/v1/jobs/__init__.py | 3 +- main.py | 1 + requirements.txt | 60 ++++++++++++++++---- 11 files changed, 130 insertions(+), 30 deletions(-) create mode 100644 api/auth/__init__.py rename api/{v1/token/auth.py => auth/auth_utils.py} (93%) rename api/{v1/token => auth}/db_utils.py (59%) create mode 100644 api/auth/register/__init__.py rename api/{v1 => auth}/token/__init__.py (80%) 
diff --git a/api/__init__.py b/api/__init__.py index 02838e3..be2fc6b 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -1,4 +1,5 @@ from fastapi import APIRouter +from api.auth import router as auth_router from .v1 import router as v1_router router = APIRouter( @@ -6,3 +7,4 @@ router = APIRouter( tags=["api"], ) router.include_router(v1_router) +router.include_router(auth_router) diff --git a/api/auth/__init__.py b/api/auth/__init__.py new file mode 100644 index 0000000..269898e --- /dev/null +++ b/api/auth/__init__.py @@ -0,0 +1,8 @@ +from fastapi import APIRouter + +from api.auth.token import router as token_router +from api.auth.register import router as register_router + +router = APIRouter(prefix="/auth") +router.include_router(token_router) +router.include_router(register_router) diff --git a/api/v1/token/auth.py b/api/auth/auth_utils.py similarity index 93% rename from api/v1/token/auth.py rename to api/auth/auth_utils.py index 8cf6fd6..12710a8 100644 --- a/api/v1/token/auth.py +++ b/api/auth/auth_utils.py @@ -6,10 +6,10 @@ from fastapi.security import OAuth2PasswordBearer from settings import * from api.core.users import TokenData -from .db_utils import UserInDB, get_user +from api.auth.db_utils import UserInDB, get_user load_dotenv() -oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/token") +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token") def create_access_token(data: dict): diff --git a/api/v1/token/db_utils.py b/api/auth/db_utils.py similarity index 59% rename from api/v1/token/db_utils.py rename to api/auth/db_utils.py index 04fe8af..339e540 100644 --- a/api/v1/token/db_utils.py +++ b/api/auth/db_utils.py @@ -1,25 +1,41 @@ from passlib.context import CryptContext - from supabase_py import create_client, Client +from fastapi import HTTPException, status + from api.core.users import UserInDB from settings import SUPABASE_URL, SUPABASE_KEY pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY) -def get_user(username: str): - result = supabase.table('users').select().eq('username', username).execute() - if 'error' in result and result['error']: +def create_user(user_create: UserInDB): + result = supabase.table("users").insert(user_create.dict()).execute() + print(f"Insert result: {result}") + + if "error" in result and result["error"]: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"User could not be created due to {result['error']['message']}", + ) + + return result + + +def get_user(username: str): + result = supabase.table("users").select().eq("username", username).execute() + + if "error" in result and result["error"]: print(f"Error: {result['error']['message']}") return None else: - if result['data']: - user_data = result['data'][0] # get the first (and should be only) user with the matching username + if result["data"]: + user_data = result["data"][0] return UserInDB(**user_data) else: return None + def verify_password(password: str, hashed_password: str): return pwd_context.verify(password, hashed_password) diff --git a/api/auth/register/__init__.py b/api/auth/register/__init__.py new file mode 100644 index 0000000..1e140ca --- /dev/null +++ b/api/auth/register/__init__.py @@ -0,0 +1,28 @@ +from fastapi import APIRouter, HTTPException, status +from api.core.users import UserCreate, UserInDB +from api.auth.db_utils import get_user, get_password_hash, create_user + +router = APIRouter(prefix="/register", tags=["register"]) + + 
+@router.post("/") +async def register_new_user(user: UserCreate): + existing_user = get_user(user.username) + if existing_user is not None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Username already exists", + ) + + hashed_password = get_password_hash(user.password) + print(f"Hashed password: {hashed_password}") + user_create = UserInDB( + username=user.username, + email=user.email, + full_name=user.full_name, + hashed_password=hashed_password, + disabled=False, + ) + create_user(user_create) + + return {"detail": "User created successfully"} diff --git a/api/v1/token/__init__.py b/api/auth/token/__init__.py similarity index 80% rename from api/v1/token/__init__.py rename to api/auth/token/__init__.py index e3eebf3..9f73f1c 100644 --- a/api/v1/token/__init__.py +++ b/api/auth/token/__init__.py @@ -2,17 +2,16 @@ from fastapi import APIRouter, Depends, HTTPException, status from fastapi.security import OAuth2PasswordRequestForm from api.core.users import Token -from .db_utils import authenticate_user -from .auth import create_access_token +from api.auth.db_utils import authenticate_user +from api.auth.auth_utils import create_access_token -ACCESS_TOKEN_EXPIRE_MINUTES = 30 -router = APIRouter(prefix="/token") +router = APIRouter(prefix="/token", tags=["token"]) @router.post("/", response_model=Token) async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): user = authenticate_user(form_data.username, form_data.password) - if user is None: + if not user: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect username or password", diff --git a/api/core/users/__init__.py b/api/core/users/__init__.py index 5b33b94..55f7e8f 100644 --- a/api/core/users/__init__.py +++ b/api/core/users/__init__.py @@ -1,10 +1,18 @@ from pydantic import BaseModel + class User(BaseModel): username: str full_name: str email: str - disabled: bool + disabled: bool = False + + +class UserCreate(BaseModel): + username: str + full_name: str + email: str + password: str class UserInDB(User): diff --git a/api/v1/__init__.py b/api/v1/__init__.py index 33f258d..63eb1a7 100644 --- a/api/v1/__init__.py +++ b/api/v1/__init__.py @@ -1,7 +1,8 @@ -from fastapi import APIRouter +from fastapi import APIRouter, Depends from .jobs import router as jobs_router -from .token import router as token_router +from api.auth.token import router as token_router +from api.auth.auth_utils import get_active_current_user -router = APIRouter(prefix="/v1") +router = APIRouter(prefix="/v1", dependencies=[Depends(get_active_current_user)]) router.include_router(jobs_router) router.include_router(token_router) diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index 5ba1ec9..e79c4bf 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -4,9 +4,8 @@ from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper from api.core.scrapers.linkedin import LinkedInScraper from api.core.scrapers import ScraperInput, Site -from ...v1.token.auth import get_active_current_user -router = APIRouter(prefix="/jobs", dependencies=[Depends(get_active_current_user)]) +router = APIRouter(prefix="/jobs") SCRAPER_MAPPING = { Site.LINKEDIN: LinkedInScraper, diff --git a/main.py b/main.py index f87c3d1..14bb377 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,7 @@ app = FastAPI( ) app.include_router(api_router) + @app.get("/", tags=["health"]) async def health_check(): return {"message": "JobSpy ready to scrape"} 
diff --git a/requirements.txt b/requirements.txt index b48c7e6..a8e0f11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,49 @@ -fastapi~=0.99.1 -pydantic~=1.10.11 -beautifulsoup4~=4.12.2 -requests -pip~=21.3.1 -wheel~=0.37.1 -setuptools~=60.2.0 -passlib~=1.7.4 -cryptography~=41.0.1 -python-jose~=3.3.0 -python-dotenv~=1.0.0 \ No newline at end of file +anyio==3.7.1 +attrs==23.1.0 +bcrypt==4.0.1 +beautifulsoup4==4.12.2 +certifi==2023.5.7 +cffi==1.15.1 +chardet==4.0.0 +charset-normalizer==3.2.0 +click==8.1.4 +cryptography==41.0.1 +dataclasses==0.6 +deprecation==2.1.0 +ecdsa==0.18.0 +exceptiongroup==1.1.2 +fastapi==0.99.1 +gotrue==0.2.0 +h11==0.14.0 +httpcore==0.12.3 +httpx==0.16.1 +idna==2.10 +iniconfig==2.0.0 +packaging==23.1 +passlib==1.7.4 +pluggy==1.2.0 +postgrest-py==0.4.0 +py==1.11.0 +pyasn1==0.5.0 +pycparser==2.21 +pydantic==1.10.11 +pytest==6.2.5 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-jose==3.3.0 +python-multipart==0.0.6 +realtime-py==0.1.3 +requests==2.25.1 +rfc3986==1.5.0 +rsa==4.9 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.4.1 +starlette==0.27.0 +supabase-py==0.0.2 +tls-client==0.2.1 +toml==0.10.2 +typing_extensions==4.7.1 +urllib3==1.26.16 +uvicorn==0.22.0 +websockets==9.1 From 069808c7088463ec7d40910deb82e7a333a67087 Mon Sep 17 00:00:00 2001 From: Cullen Date: Sun, 9 Jul 2023 18:48:48 -0500 Subject: [PATCH 03/18] docs: remove invalid token route --- api/v1/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/api/v1/__init__.py b/api/v1/__init__.py index 63eb1a7..15e5964 100644 --- a/api/v1/__init__.py +++ b/api/v1/__init__.py @@ -1,8 +1,6 @@ from fastapi import APIRouter, Depends from .jobs import router as jobs_router -from api.auth.token import router as token_router from api.auth.auth_utils import get_active_current_user router = APIRouter(prefix="/v1", dependencies=[Depends(get_active_current_user)]) router.include_router(jobs_router) -router.include_router(token_router) From 55f9963a943ebcc31b93917180a8dc2405556360 Mon Sep 17 00:00:00 2001 From: Cullen Date: Sun, 9 Jul 2023 18:53:29 -0500 Subject: [PATCH 04/18] chore(postman): add collection and environment files --- JobSpy.postman_collection.json | 190 ++++++++++++++++++++++++++++++++ JobSpy.postman_environment.json | 15 +++ 2 files changed, 205 insertions(+) create mode 100644 JobSpy.postman_collection.json create mode 100644 JobSpy.postman_environment.json diff --git a/JobSpy.postman_collection.json b/JobSpy.postman_collection.json new file mode 100644 index 0000000..a7f2538 --- /dev/null +++ b/JobSpy.postman_collection.json @@ -0,0 +1,190 @@ +{ + "info": { + "_postman_id": "a0aa8829-fd18-47fa-a50d-03c8193adb9a", + "name": "JobSpy", + "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", + "_exporter_id": "24144392" + }, + "item": [ + { + "name": "Search Jobs", + "request": { + "auth": { + "type": "bearer", + "bearer": [ + { + "key": "token", + "value": "{{access_token}}", + "type": "string" + } + ] + }, + "method": "GET", + "header": [], + "url": { + "raw": "http://127.0.0.1:8000/api/v1/jobs?site_type=indeed&search_term=software intern&location=dallas&page=2", + "protocol": "http", + "host": [ + "127", + "0", + "0", + "1" + ], + "port": "8000", + "path": [ + "api", + "v1", + "jobs" + ], + "query": [ + { + "key": "site_type", + "value": "indeed" + }, + { + "key": "search_term", + "value": "software intern" + }, + { + "key": "location", + "value": "dallas" + }, + { + "key": "page", + "value": "2" + } + ] + } + }, + "response": [] + }, + { + "name": 
"Health", + "request": { + "auth": { + "type": "bearer", + "bearer": [ + { + "key": "token", + "value": "{{access_token}}", + "type": "string" + } + ] + }, + "method": "GET", + "header": [], + "url": { + "raw": "http://127.0.0.1:8000/", + "protocol": "http", + "host": [ + "127", + "0", + "0", + "1" + ], + "port": "8000", + "path": [ + "" + ] + } + }, + "response": [] + }, + { + "name": "Token", + "event": [ + { + "listen": "test", + "script": { + "exec": [ + "var jsonData = JSON.parse(responseBody);", + "postman.setEnvironmentVariable(\"access_token\", jsonData.access_token)" + ], + "type": "text/javascript" + } + } + ], + "request": { + "method": "POST", + "header": [], + "body": { + "mode": "urlencoded", + "urlencoded": [ + { + "key": "username", + "value": "cwatson", + "type": "text" + }, + { + "key": "password", + "value": "mypass", + "type": "text" + } + ] + }, + "url": { + "raw": "http://127.0.0.1:8000/api/auth/token", + "protocol": "http", + "host": [ + "127", + "0", + "0", + "1" + ], + "port": "8000", + "path": [ + "api", + "auth", + "token" + ] + } + }, + "response": [] + }, + { + "name": "Register", + "event": [ + { + "listen": "test", + "script": { + "exec": [ + "var jsonData = JSON.parse(responseBody);", + "postman.setEnvironmentVariable(\"access_token\", jsonData.access_token)" + ], + "type": "text/javascript" + } + } + ], + "request": { + "method": "POST", + "header": [], + "body": { + "mode": "raw", + "raw": "{\n \"username\": \"cwatson\",\n \"email\": \"cgwatson@smu.edu\",\n \"password\": \"mypass\",\n \"full_name\": \"cullen watson\"\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "http://127.0.0.1:8000/api/auth/register", + "protocol": "http", + "host": [ + "127", + "0", + "0", + "1" + ], + "port": "8000", + "path": [ + "api", + "auth", + "register" + ] + } + }, + "response": [] + } + ] +} \ No newline at end of file diff --git a/JobSpy.postman_environment.json b/JobSpy.postman_environment.json new file mode 100644 index 0000000..b69ee6c --- /dev/null +++ b/JobSpy.postman_environment.json @@ -0,0 +1,15 @@ +{ + "id": "a7ea6d58-8dca-4216-97a9-224dadc1e18f", + "name": "JobSpy", + "values": [ + { + "key": "access_token", + "value": "", + "type": "any", + "enabled": true + } + ], + "_postman_variable_scope": "environment", + "_postman_exported_at": "2023-07-09T23:51:36.709Z", + "_postman_exported_using": "Postman/10.15.8" +} \ No newline at end of file From cf7bfdb474ce420d11f8a67864bacd769ef07d8b Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 16:14:05 -0500 Subject: [PATCH 05/18] feat(jobs): add distance param --- api/core/scrapers/__init__.py | 1 + api/core/scrapers/indeed/__init__.py | 1 + api/core/scrapers/linkedin/__init__.py | 6 +++++- api/core/scrapers/ziprecruiter/__init__.py | 1 + api/v1/jobs/__init__.py | 8 ++++++-- env | 3 +++ env (1) | 3 +++ 7 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 env create mode 100644 env (1) diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index d128403..d4e9546 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -12,6 +12,7 @@ class Site(Enum): class ScraperInput(BaseModel): location: str search_term: str + distance: int = 25 page: int = 1 diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index f89d8a0..b7a6605 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -26,6 +26,7 @@ class IndeedScraper(Scraper): "l": 
scraper_input.location, "filter": 0, "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "radius": scraper_input.distance, } response = session.get(self.url, params=params) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 198b0dc..7ce4f93 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,7 +16,11 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: - params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location} + params = { + "pageNum": scraper_input.page - 1, + "location": scraper_input.location, + "distance": scraper_input.distance, + } self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = requests.get(self.url, params=params) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 89fd0ec..3a35747 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -25,6 +25,7 @@ class ZipRecruiterScraper(Scraper): "search": scraper_input.search_term, "location": scraper_input.location, "page": min(scraper_input.page, 10), + "radius": scraper_input.distance, } response = session.get( diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index e79c4bf..dba24a0 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -15,11 +15,15 @@ SCRAPER_MAPPING = { @router.get("/") -async def scrape_jobs(site_type: Site, search_term: str, location: str, page: int = 1): +async def scrape_jobs( + site_type: Site, search_term: str, location: str, page: int = 1, distance: int = 25 +): scraper_class = SCRAPER_MAPPING[site_type] scraper = scraper_class() - scraper_input = ScraperInput(search_term=search_term, location=location, page=page) + scraper_input = ScraperInput( + search_term=search_term, location=location, page=page, distance=distance + ) job_response = scraper.scrape(scraper_input) return job_response diff --git a/env b/env new file mode 100644 index 0000000..6c3bc8e --- /dev/null +++ b/env @@ -0,0 +1,3 @@ +JWT_SECRET_KEY=55017c787e4aa8fec8b22756200a7c56deaba6fe5154959bcbd80065ab49a906 +SUPABASE_URL=https://ipudiqgwngjlaobogjey.supabase.co +SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlwdWRpcWd3bmdqbGFvYm9namV5Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4ODkzMjc3NiwiZXhwIjoyMDA0NTA4Nzc2fQ.AdHkU2RaC-GScXk0g6zHzfNOJ8sY1geJWvMMgx8j900 diff --git a/env (1) b/env (1) new file mode 100644 index 0000000..6c3bc8e --- /dev/null +++ b/env (1) @@ -0,0 +1,3 @@ +JWT_SECRET_KEY=55017c787e4aa8fec8b22756200a7c56deaba6fe5154959bcbd80065ab49a906 +SUPABASE_URL=https://ipudiqgwngjlaobogjey.supabase.co +SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlwdWRpcWd3bmdqbGFvYm9namV5Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4ODkzMjc3NiwiZXhwIjoyMDA0NTA4Nzc2fQ.AdHkU2RaC-GScXk0g6zHzfNOJ8sY1geJWvMMgx8j900 From 9e9da7b61560802ca309e21913e367235ced759d Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 16:14:05 -0500 Subject: [PATCH 06/18] feat(jobs): add distance param --- api/core/scrapers/__init__.py | 1 + api/core/scrapers/indeed/__init__.py | 1 + api/core/scrapers/linkedin/__init__.py | 6 +++++- api/core/scrapers/ziprecruiter/__init__.py | 1 + api/v1/jobs/__init__.py | 8 ++++++-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/api/core/scrapers/__init__.py 
b/api/core/scrapers/__init__.py index d128403..d4e9546 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -12,6 +12,7 @@ class Site(Enum): class ScraperInput(BaseModel): location: str search_term: str + distance: int = 25 page: int = 1 diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index f89d8a0..b7a6605 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -26,6 +26,7 @@ class IndeedScraper(Scraper): "l": scraper_input.location, "filter": 0, "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "radius": scraper_input.distance, } response = session.get(self.url, params=params) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 198b0dc..7ce4f93 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,7 +16,11 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: - params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location} + params = { + "pageNum": scraper_input.page - 1, + "location": scraper_input.location, + "distance": scraper_input.distance, + } self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = requests.get(self.url, params=params) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 89fd0ec..3a35747 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -25,6 +25,7 @@ class ZipRecruiterScraper(Scraper): "search": scraper_input.search_term, "location": scraper_input.location, "page": min(scraper_input.page, 10), + "radius": scraper_input.distance, } response = session.get( diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index e79c4bf..dba24a0 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -15,11 +15,15 @@ SCRAPER_MAPPING = { @router.get("/") -async def scrape_jobs(site_type: Site, search_term: str, location: str, page: int = 1): +async def scrape_jobs( + site_type: Site, search_term: str, location: str, page: int = 1, distance: int = 25 +): scraper_class = SCRAPER_MAPPING[site_type] scraper = scraper_class() - scraper_input = ScraperInput(search_term=search_term, location=location, page=page) + scraper_input = ScraperInput( + search_term=search_term, location=location, page=page, distance=distance + ) job_response = scraper.scrape(scraper_input) return job_response From 138fbab86153933a59c843994082961b4b8e2d45 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:43:45 -0500 Subject: [PATCH 07/18] - indeed refactor - wanted_results init --- api/core/jobs/__init__.py | 12 +++++--- api/core/scrapers/__init__.py | 8 ++---- api/core/scrapers/indeed/__init__.py | 41 ++++++++++++++++++++++------ env | 3 -- env (1) | 3 -- 5 files changed, 43 insertions(+), 24 deletions(-) delete mode 100644 env delete mode 100644 env (1) diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index 2026422..ae6feca 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -58,8 +58,12 @@ class JobPost(BaseModel): class JobResponse(BaseModel): - job_count: int - page: int = 1 - total_pages: int + success: bool + error: str = None + + total_pages: int = None + job_count: int = None + + page: int = None + jobs: list[JobPost] = [] - jobs: list[JobPost] diff --git 
a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index d4e9546..b8d8bfd 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from pydantic import BaseModel from enum import Enum -from ..jobs import JobResponse +from ..jobs import JobResponse, JobPost class Site(Enum): @@ -13,13 +13,11 @@ class ScraperInput(BaseModel): location: str search_term: str distance: int = 25 - - page: int = 1 + results_wanted: int = 15 #: TODO: implement class Scraper: #: to be used as a child class def __init__(self, site: Site): self.site = site - def scrape(self, scraper_input: ScraperInput) -> JobResponse: - ... + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index b7a6605..d25f7b3 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -10,6 +10,10 @@ from api.core.jobs import * from api.core.scrapers import Scraper, ScraperInput, Site +class ParsingException(Exception): + pass + + class IndeedScraper(Scraper): def __init__(self): site = Site(Site.INDEED) @@ -25,7 +29,7 @@ class IndeedScraper(Scraper): "q": scraper_input.search_term, "l": scraper_input.location, "filter": 0, - "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "start": 0, "radius": scraper_input.distance, } @@ -38,12 +42,25 @@ class IndeedScraper(Scraper): soup = BeautifulSoup(response.content, "html.parser") - jobs = IndeedScraper.parse_jobs(soup) + try: + jobs = IndeedScraper.parse_jobs(soup) + except ParsingException: + return JobResponse( + success=False, + error="Failed to parse jobs.", + ) + total_num_jobs = IndeedScraper.total_jobs(soup) total_pages = ceil(total_num_jobs / 15) job_list: list[JobPost] = [] - # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] + if not jobs.get('metaData', {}).get("mosaicProviderJobCardsModel", {}).get("results"): + return JobResponse( + success=False, + error="No jobs found", + ) + + page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: snippet_html = BeautifulSoup(job["snippet"], "html.parser") @@ -94,9 +111,10 @@ class IndeedScraper(Scraper): job_list.append(job_post) job_response = JobResponse( + success=True, jobs=job_list, job_count=total_num_jobs, - page=scraper_input.page, + page=page_number, total_pages=total_pages, ) return job_response @@ -116,7 +134,14 @@ class IndeedScraper(Scraper): return None @staticmethod - def parse_jobs(soup): + def parse_jobs(soup: BeautifulSoup) -> dict: + """ + Parses the jobs from the soup object + + :param soup: + :return: jobs + """ + script_tag = IndeedScraper.find_mosaic_script(soup) if script_tag: @@ -130,11 +155,9 @@ class IndeedScraper(Scraper): jobs = json.loads(m.group(1).strip()) return jobs else: - return {"message": f"Could not find mosaic provider job cards data"} + raise ParsingException("Could not find mosaic provider job cards data") else: - return { - "message": f"Could not find a script tag containing mosaic provider data" - } + raise ParsingException("Could not find a script tag containing mosaic provider data") @staticmethod def total_jobs(soup): diff --git a/env b/env deleted file mode 100644 index 6c3bc8e..0000000 --- a/env +++ /dev/null @@ -1,3 +0,0 @@ -JWT_SECRET_KEY=55017c787e4aa8fec8b22756200a7c56deaba6fe5154959bcbd80065ab49a906 -SUPABASE_URL=https://ipudiqgwngjlaobogjey.supabase.co 
-SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlwdWRpcWd3bmdqbGFvYm9namV5Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4ODkzMjc3NiwiZXhwIjoyMDA0NTA4Nzc2fQ.AdHkU2RaC-GScXk0g6zHzfNOJ8sY1geJWvMMgx8j900 diff --git a/env (1) b/env (1) deleted file mode 100644 index 6c3bc8e..0000000 --- a/env (1) +++ /dev/null @@ -1,3 +0,0 @@ -JWT_SECRET_KEY=55017c787e4aa8fec8b22756200a7c56deaba6fe5154959bcbd80065ab49a906 -SUPABASE_URL=https://ipudiqgwngjlaobogjey.supabase.co -SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlwdWRpcWd3bmdqbGFvYm9namV5Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4ODkzMjc3NiwiZXhwIjoyMDA0NTA4Nzc2fQ.AdHkU2RaC-GScXk0g6zHzfNOJ8sY1geJWvMMgx8j900 From 5fc9e480c8297f3d6196c459124e582e192317bd Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:47:22 -0500 Subject: [PATCH 08/18] - linkedin refactor --- api/core/scrapers/linkedin/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 7ce4f93..a28c9c3 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,8 +16,10 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: + current_page = 0 + params = { - "pageNum": scraper_input.page - 1, + "pageNum": current_page, "location": scraper_input.location, "distance": scraper_input.distance, } @@ -58,6 +60,8 @@ class LinkedInScraper(Scraper): if datetime_tag: datetime_str = datetime_tag["datetime"] date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") + else: + date_posted = None job_post = JobPost( title=title, @@ -74,9 +78,11 @@ class LinkedInScraper(Scraper): job_count = int("".join(filter(str.isdigit, job_count_text))) total_pages = ceil(job_count / 25) job_response = JobResponse( + success=True, + jobs=job_list, job_count=job_count, - page=scraper_input.page, + page=current_page + 1, total_pages=total_pages, ) return job_response From 3a955d4d2677d43b2500e0926be27d422e8e2c57 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:51:55 -0500 Subject: [PATCH 09/18] - ziprecruiter refactor --- api/core/scrapers/ziprecruiter/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 3a35747..0cee62d 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -21,10 +21,12 @@ class ZipRecruiterScraper(Scraper): client_identifier="chrome112", random_tls_extension_order=True ) + current_page = 1 + params = { "search": scraper_input.search_term, "location": scraper_input.location, - "page": min(scraper_input.page, 10), + "page": min(current_page, 10), "radius": scraper_input.distance, } @@ -80,6 +82,7 @@ class ZipRecruiterScraper(Scraper): job_count = job_count.replace(",", "") total_pages = data["maxPages"] job_response = JobResponse( + success=True, jobs=job_list, job_count=job_count, page=params["page"], @@ -87,6 +90,7 @@ class ZipRecruiterScraper(Scraper): ) return job_response + @staticmethod def get_interval(interval_str): interval_alias = {"annually": CompensationInterval.YEARLY} interval_str = interval_str.lower() @@ -97,7 +101,7 @@ class ZipRecruiterScraper(Scraper): return CompensationInterval(interval_str) @staticmethod - def get_date_posted(job: str): + def 
get_date_posted(job: BeautifulSoup): button = job.find( "button", {"class": "action_input save_job zrs_btn_secondary_200"} ) @@ -107,7 +111,7 @@ class ZipRecruiterScraper(Scraper): return params.get("posted_time", [None])[0] @staticmethod - def get_compensation(job): + def get_compensation(job: BeautifulSoup): pay_element = job.find("li", {"class": "perk_item perk_pay"}) if pay_element is None: return None @@ -116,7 +120,7 @@ class ZipRecruiterScraper(Scraper): return ZipRecruiterScraper.create_compensation_object(pay) @staticmethod - def get_location(job): + def get_location(job: BeautifulSoup): location_string = job.find("a", {"class": "company_location"}).text.strip() parts = location_string.split(", ") city, state = parts From 352c5cb09296984a564b55b0653b6f26d7625634 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 17:57:35 -0500 Subject: [PATCH 10/18] chore(jobs) remove invalid attr response.reason --- api/core/scrapers/indeed/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index d25f7b3..1138639 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -37,7 +37,7 @@ class IndeedScraper(Scraper): if response.status_code != status.HTTP_200_OK: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Response returned {response.status_code} {response.reason}", + detail=f"Response returned {response.status_code}", ) soup = BeautifulSoup(response.content, "html.parser") From 90db0987f992b11c95bc2a643e0c7c99edecb439 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 18:04:44 -0500 Subject: [PATCH 11/18] - api/jobs/v1 schema change --- api/core/scrapers/__init__.py | 5 ++++- api/v1/jobs/__init__.py | 11 ++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index b8d8bfd..da3063f 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -10,9 +10,12 @@ class Site(Enum): class ScraperInput(BaseModel): - location: str + site_type: Site + search_term: str + location: str distance: int = 25 + results_wanted: int = 15 #: TODO: implement diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index dba24a0..089d2f6 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -3,7 +3,7 @@ from fastapi import APIRouter, Depends from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper from api.core.scrapers.linkedin import LinkedInScraper -from api.core.scrapers import ScraperInput, Site +from api.core.scrapers import ScraperInput, Site, JobResponse router = APIRouter(prefix="/jobs") @@ -14,16 +14,13 @@ SCRAPER_MAPPING = { } -@router.get("/") +@router.get("/", response_model=JobResponse) async def scrape_jobs( - site_type: Site, search_term: str, location: str, page: int = 1, distance: int = 25 + scraper_input: ScraperInput ): - scraper_class = SCRAPER_MAPPING[site_type] + scraper_class = SCRAPER_MAPPING[scraper_input.site_type] scraper = scraper_class() - scraper_input = ScraperInput( - search_term=search_term, location=location, page=page, distance=distance - ) job_response = scraper.scrape(scraper_input) return job_response From 98f5947b7addc453c487b2983840804263fda03b Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 18:17:46 -0500 Subject: [PATCH 12/18] refactor(jobs): use JobResponse model for bad requests 
--- api/auth/auth_utils.py | 1 - api/core/scrapers/indeed/__init__.py | 7 ++++--- api/core/scrapers/linkedin/__init__.py | 8 +++++--- api/core/scrapers/ziprecruiter/__init__.py | 7 ++++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/api/auth/auth_utils.py b/api/auth/auth_utils.py index 12710a8..b9fdfbc 100644 --- a/api/auth/auth_utils.py +++ b/api/auth/auth_utils.py @@ -13,7 +13,6 @@ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token") def create_access_token(data: dict): - print(JWT_SECRET_KEY) to_encode = data.copy() expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) to_encode.update({"exp": expire}) diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index 1138639..9004b4b 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -35,9 +35,10 @@ class IndeedScraper(Scraper): response = session.get(self.url, params=params) if response.status_code != status.HTTP_200_OK: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Response returned {response.status_code}", + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", + http_response_code=response.status_code ) soup = BeautifulSoup(response.content, "html.parser") diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index a28c9c3..e403366 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -26,10 +26,12 @@ class LinkedInScraper(Scraper): self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = requests.get(self.url, params=params) + response.status_code = 300 if response.status_code != status.HTTP_200_OK: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Response returned {response.status_code} {response.reason}", + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", + http_response_code=response.status_code ) soup = BeautifulSoup(response.text, "html.parser") diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 0cee62d..cdb495b 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -34,9 +34,10 @@ class ZipRecruiterScraper(Scraper): self.url, headers=ZipRecruiterScraper.headers(), params=params ) if response.status_code != status.HTTP_200_OK: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Response returned {response.status_code} {response.reason}", + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", + http_response_code=response.status_code ) html_string = response.content From cd62926092dde2820629d6c70c0a81d9105157b4 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 18:19:37 -0500 Subject: [PATCH 13/18] refactor(jobs): use JobResponse model for bad requests --- api/core/scrapers/linkedin/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index e403366..b22a63b 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -26,7 +26,7 @@ class LinkedInScraper(Scraper): self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = requests.get(self.url, params=params) - response.status_code = 300 + if response.status_code != status.HTTP_200_OK: 
return JobResponse( success=False, From 38c025d43c3b8acb2e8ef141b433745b637e3154 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 18:21:01 -0500 Subject: [PATCH 14/18] - api/jobs/v1 bug fix --- api/v1/jobs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index 089d2f6..d659ff4 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -14,7 +14,7 @@ SCRAPER_MAPPING = { } -@router.get("/", response_model=JobResponse) +@router.post("/", response_model=JobResponse) async def scrape_jobs( scraper_input: ScraperInput ): From 7ac4636e9d35c42e3d39e616a9adc5c45ea888d1 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 18:36:11 -0500 Subject: [PATCH 15/18] chore: black --- api/core/scrapers/indeed/__init__.py | 1 - api/core/scrapers/linkedin/__init__.py | 1 - api/core/scrapers/ziprecruiter/__init__.py | 1 - 3 files changed, 3 deletions(-) diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index 9004b4b..b7cf0fe 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -38,7 +38,6 @@ class IndeedScraper(Scraper): return JobResponse( success=False, error=f"Response returned {response.status_code}", - http_response_code=response.status_code ) soup = BeautifulSoup(response.content, "html.parser") diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index b22a63b..0f9de6f 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -31,7 +31,6 @@ class LinkedInScraper(Scraper): return JobResponse( success=False, error=f"Response returned {response.status_code}", - http_response_code=response.status_code ) soup = BeautifulSoup(response.text, "html.parser") diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index cdb495b..e4c6f30 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -37,7 +37,6 @@ class ZipRecruiterScraper(Scraper): return JobResponse( success=False, error=f"Response returned {response.status_code}", - http_response_code=response.status_code ) html_string = response.content From 0a321333bad605259fc78f38fb50b1253c8c57aa Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 22:07:19 -0500 Subject: [PATCH 16/18] feat(jobs): remove pages for results_wanted --- api/auth/auth_utils.py | 20 +- api/auth/db_utils.py | 43 +++- api/auth/register/__init__.py | 11 +- api/auth/token/__init__.py | 12 +- api/core/jobs/__init__.py | 5 +- api/core/scrapers/__init__.py | 9 +- api/core/scrapers/indeed/__init__.py | 247 ++++++++++++--------- api/core/scrapers/linkedin/__init__.py | 151 ++++++++----- api/core/scrapers/ziprecruiter/__init__.py | 214 +++++++++++------- api/v1/jobs/__init__.py | 6 +- settings.py | 2 +- 11 files changed, 449 insertions(+), 271 deletions(-) diff --git a/api/auth/auth_utils.py b/api/auth/auth_utils.py index b9fdfbc..f846329 100644 --- a/api/auth/auth_utils.py +++ b/api/auth/auth_utils.py @@ -12,7 +12,12 @@ load_dotenv() oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token") -def create_access_token(data: dict): +def create_access_token(data: dict) -> str: + """ + Creates a JWT token based on the data provided. 
+ :param data + :return: encoded_jwt + """ to_encode = data.copy() expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) to_encode.update({"exp": expire}) @@ -21,6 +26,12 @@ def create_access_token(data: dict): async def get_current_user(token: str = Depends(oauth2_scheme)): + """ + Returns the current user associated with the provided JWT token. + :param token + :raises HTTPException: If the token is invalid or the user does not exist. + :return: The UserInDB instance associated with the token. + """ credential_exception = HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Could not validate credentials", @@ -42,6 +53,13 @@ async def get_current_user(token: str = Depends(oauth2_scheme)): async def get_active_current_user(current_user: UserInDB = Depends(get_current_user)): + """ + Returns the current user if the user account is active. + + :param current_user: A UserInDB instance representing the current user. + :raises HTTPException: If the user account is inactive. + :return: The UserInDB instance if the user account is active. + """ if current_user.disabled: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Inactive user." diff --git a/api/auth/db_utils.py b/api/auth/db_utils.py index 339e540..bca7f4c 100644 --- a/api/auth/db_utils.py +++ b/api/auth/db_utils.py @@ -1,3 +1,5 @@ +from typing import Optional, Union + from passlib.context import CryptContext from supabase_py import create_client, Client from fastapi import HTTPException, status @@ -10,6 +12,13 @@ supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY) def create_user(user_create: UserInDB): + """ + Creates a new user record in the 'users' table in Supabase. + + :param user_create: The data of the user to be created. + :raises HTTPException: If an error occurs while creating the user. + :return: The result of the insert operation. + """ result = supabase.table("users").insert(user_create.dict()).execute() print(f"Insert result: {result}") @@ -22,7 +31,13 @@ def create_user(user_create: UserInDB): return result -def get_user(username: str): +def get_user(username: str) -> Optional[UserInDB]: + """ + Retrieves a user from the 'users' table by their username. + + :param username: The username of the user to retrieve. + :return: The user data if found, otherwise None. + """ result = supabase.table("users").select().eq("username", username).execute() if "error" in result and result["error"]: @@ -36,15 +51,35 @@ def get_user(username: str): return None -def verify_password(password: str, hashed_password: str): +def verify_password(password: str, hashed_password: str) -> bool: + """ + Verifies a password against a hashed password using the bcrypt hashing algorithm. + + :param password: The plaintext password to verify. + :param hashed_password: The hashed password to compare against. + :return: True if the password matches the hashed password, otherwise False. + """ return pwd_context.verify(password, hashed_password) -def get_password_hash(password): +def get_password_hash(password: str) -> str: + """ + Hashes a password using the bcrypt hashing algorithm. + + :param password: The plaintext password to hash. + :return: The hashed password + """ return pwd_context.hash(password) -def authenticate_user(username: str, password: str): +def authenticate_user(username: str, password: str) -> Union[UserInDB, bool]: + """ + Authenticates a user based on their username and password. + + :param username: The username of the user to authenticate. 
+ :param password: The plaintext password to authenticate. + :return: The authenticated user if the username and password are correct, otherwise False. + """ user = get_user(username) if not user: return False diff --git a/api/auth/register/__init__.py b/api/auth/register/__init__.py index 1e140ca..56980c2 100644 --- a/api/auth/register/__init__.py +++ b/api/auth/register/__init__.py @@ -5,8 +5,14 @@ from api.auth.db_utils import get_user, get_password_hash, create_user router = APIRouter(prefix="/register", tags=["register"]) -@router.post("/") -async def register_new_user(user: UserCreate): +@router.post("/", response_model=dict) +async def register_new_user(user: UserCreate) -> dict: + """ + Creates new user + :param user: + :raises HTTPException: If the username already exists. + :return: A dictionary containing a detail key with a success message. + """ existing_user = get_user(user.username) if existing_user is not None: raise HTTPException( @@ -15,7 +21,6 @@ async def register_new_user(user: UserCreate): ) hashed_password = get_password_hash(user.password) - print(f"Hashed password: {hashed_password}") user_create = UserInDB( username=user.username, email=user.email, diff --git a/api/auth/token/__init__.py b/api/auth/token/__init__.py index 9f73f1c..907ff0c 100644 --- a/api/auth/token/__init__.py +++ b/api/auth/token/__init__.py @@ -9,7 +9,15 @@ router = APIRouter(prefix="/token", tags=["token"]) @router.post("/", response_model=Token) -async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): +async def login_for_access_token( + form_data: OAuth2PasswordRequestForm = Depends(), +) -> Token: + """ + Authenticates a user and provides an access token. + :param form_data: OAuth2PasswordRequestForm object containing the user's credentials. + :raises HTTPException: If the user cannot be authenticated. + :return: A Token object containing the access token and the token type. + """ user = authenticate_user(form_data.username, form_data.password) if not user: raise HTTPException( @@ -19,4 +27,4 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends( ) access_token = create_access_token(data={"sub": user.username}) - return {"access_token": access_token, "token_type": "bearer"} + return Token(access_token=access_token, token_type="bearer") diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index ae6feca..ee7e63e 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -11,6 +11,7 @@ class JobType(Enum): TEMPORARY = "temporary" PER_DIEM = "per_diem" NIGHTS = "nights" + OTHER = "other" class Location(BaseModel): @@ -61,9 +62,5 @@ class JobResponse(BaseModel): success: bool error: str = None - total_pages: int = None job_count: int = None - - page: int = None jobs: list[JobPost] = [] - diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index da3063f..018ab4c 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from pydantic import BaseModel from enum import Enum -from ..jobs import JobResponse, JobPost +from ..jobs import JobResponse class Site(Enum): @@ -16,11 +16,12 @@ class ScraperInput(BaseModel): location: str distance: int = 25 - results_wanted: int = 15 #: TODO: implement + results_wanted: int = 15 -class Scraper: #: to be used as a child class +class Scraper: def __init__(self, site: Site): self.site = site - def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... 
+ def scrape(self, scraper_input: ScraperInput) -> JobResponse: + ... diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index b7cf0fe..5bdb905 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -1,10 +1,11 @@ import re import json -from math import ceil +from typing import Optional import tls_client from bs4 import BeautifulSoup -from fastapi import HTTPException, status +from bs4.element import Tag +from fastapi import status from api.core.jobs import * from api.core.scrapers import Scraper, ScraperInput, Site @@ -16,112 +17,143 @@ class ParsingException(Exception): class IndeedScraper(Scraper): def __init__(self): + """ + Initializes IndeedScraper with the Indeed job search url + """ site = Site(Site.INDEED) super().__init__(site) self.url = "https://www.indeed.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: + """ + Scrapes Indeed for jobs with scraper_input criteria + :param scraper_input: + :return: job_response + """ session = tls_client.Session( client_identifier="chrome112", random_tls_extension_order=True ) - params = { - "q": scraper_input.search_term, - "l": scraper_input.location, - "filter": 0, - "start": 0, - "radius": scraper_input.distance, - } - - response = session.get(self.url, params=params) - if response.status_code != status.HTTP_200_OK: - return JobResponse( - success=False, - error=f"Response returned {response.status_code}", - ) - - soup = BeautifulSoup(response.content, "html.parser") - - try: - jobs = IndeedScraper.parse_jobs(soup) - except ParsingException: - return JobResponse( - success=False, - error="Failed to parse jobs.", - ) - - total_num_jobs = IndeedScraper.total_jobs(soup) - total_pages = ceil(total_num_jobs / 15) - job_list: list[JobPost] = [] - if not jobs.get('metaData', {}).get("mosaicProviderJobCardsModel", {}).get("results"): - return JobResponse( - success=False, - error="No jobs found", - ) + page = 0 + processed_jobs, total_num_jobs = 0, 0 + seen_urls = set() + while len(job_list) < scraper_input.results_wanted: + params = { + "q": scraper_input.search_term, + "l": scraper_input.location, + "filter": 0, + "start": 0 + page * 10, + "radius": scraper_input.distance, + } - page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] - for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: - snippet_html = BeautifulSoup(job["snippet"], "html.parser") + response = session.get(self.url, params=params) - extracted_salary = job.get("extractedSalary") - compensation = None - if extracted_salary: - salary_snippet = job.get("salarySnippet") - currency = salary_snippet.get("currency") if salary_snippet else None - interval = (extracted_salary.get("type"),) - if isinstance(interval, tuple): - interval = interval[0] - - interval = interval.upper() - if interval in CompensationInterval.__members__: - compensation = Compensation( - interval=CompensationInterval[interval], - min_amount=extracted_salary.get("max"), - max_amount=extracted_salary.get("min"), - currency=currency, - ) - - job_type = IndeedScraper.get_job_type(job) - if job.get("thirdPartyApplyUrl"): - delivery = Delivery( - method=DeliveryEnum.URL, value=job["thirdPartyApplyUrl"] + if response.status_code != status.HTTP_200_OK: + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", ) - else: - delivery = None - timestamp_seconds = job["pubDate"] / 1000 - date_posted = datetime.fromtimestamp(timestamp_seconds) - first_li = 
snippet_html.find("li") - job_post = JobPost( - title=job["normTitle"], - description=first_li.text if first_li else None, - company_name=job["company"], - location=Location( - city=job["jobLocationCity"], - state=job["jobLocationState"], - postal_code=job.get("jobLocationPostal"), - country="US", - ), - job_type=job_type, - compensation=compensation, - date_posted=date_posted, - delivery=delivery, - ) - job_list.append(job_post) + soup = BeautifulSoup(response.content, "html.parser") + try: + jobs = IndeedScraper.parse_jobs(soup) + except ParsingException: + return JobResponse( + success=False, + error="Failed to parse jobs.", + ) + + total_num_jobs = IndeedScraper.total_jobs(soup) + + if ( + not jobs.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results") + ): + return JobResponse( + success=False, + error="No jobs found", + ) + + for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: + job_url = job["thirdPartyApplyUrl"] + if job_url in seen_urls: + continue + snippet_html = BeautifulSoup(job["snippet"], "html.parser") + + extracted_salary = job.get("extractedSalary") + compensation = None + if extracted_salary: + salary_snippet = job.get("salarySnippet") + currency = ( + salary_snippet.get("currency") if salary_snippet else None + ) + interval = (extracted_salary.get("type"),) + if isinstance(interval, tuple): + interval = interval[0] + + interval = interval.upper() + if interval in CompensationInterval.__members__: + compensation = Compensation( + interval=CompensationInterval[interval], + min_amount=extracted_salary.get("max"), + max_amount=extracted_salary.get("min"), + currency=currency, + ) + + job_type = IndeedScraper.get_job_type(job) + if job.get("thirdPartyApplyUrl"): + delivery = Delivery(method=DeliveryEnum.URL, value=job_url) + else: + delivery = None + timestamp_seconds = job["pubDate"] / 1000 + date_posted = datetime.fromtimestamp(timestamp_seconds) + + first_li = snippet_html.find("li") + job_post = JobPost( + title=job["normTitle"], + description=first_li.text if first_li else None, + company_name=job["company"], + location=Location( + city=job["jobLocationCity"], + state=job["jobLocationState"], + postal_code=job.get("jobLocationPostal"), + country="US", + ), + job_type=job_type, + compensation=compensation, + date_posted=date_posted, + delivery=delivery, + ) + job_list.append(job_post) + if len(job_list) >= scraper_input.results_wanted: + break + + if ( + len(job_list) >= scraper_input.results_wanted + or processed_jobs >= total_num_jobs + ): + break + page += 1 + + job_list = job_list[: scraper_input.results_wanted] job_response = JobResponse( success=True, jobs=job_list, job_count=total_num_jobs, - page=page_number, - total_pages=total_pages, ) return job_response @staticmethod - def get_job_type(data): - for taxonomy in data["taxonomyAttributes"]: + def get_job_type(job: dict) -> Optional[JobType]: + """ + Parses the job to get JobType + :param job: + :return: + """ + for taxonomy in job["taxonomyAttributes"]: if taxonomy["label"] == "job-types": if len(taxonomy["attributes"]) > 0: job_type_str = ( @@ -137,19 +169,31 @@ class IndeedScraper(Scraper): def parse_jobs(soup: BeautifulSoup) -> dict: """ Parses the jobs from the soup object - :param soup: :return: jobs """ - script_tag = IndeedScraper.find_mosaic_script(soup) + def find_mosaic_script() -> Optional[Tag]: + """ + Finds jobcards script tag + :return: script_tag + """ + script_tags = soup.find_all("script") + for tag in script_tags: + if ( + tag.string + and 
"mosaic.providerData" in tag.string + and "mosaic-provider-jobcards" in tag.string + ): + return tag + return None + + script_tag = find_mosaic_script() if script_tag: script_str = script_tag.string - pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});' p = re.compile(pattern, re.DOTALL) - m = p.search(script_str) if m: jobs = json.loads(m.group(1).strip()) @@ -157,10 +201,17 @@ class IndeedScraper(Scraper): else: raise ParsingException("Could not find mosaic provider job cards data") else: - raise ParsingException("Could not find a script tag containing mosaic provider data") + raise ParsingException( + "Could not find a script tag containing mosaic provider data" + ) @staticmethod - def total_jobs(soup): + def total_jobs(soup: BeautifulSoup) -> int: + """ + Parses the total jobs for that search from soup object + :param soup: + :return: total_num_jobs + """ script = soup.find("script", string=lambda t: "window._initialData" in t) pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL) @@ -169,17 +220,5 @@ class IndeedScraper(Scraper): if match: json_str = match.group(1) data = json.loads(json_str) - total_num_jobs = data["searchTitleBarModel"]["totalNumResults"] + total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"]) return total_num_jobs - - @staticmethod - def find_mosaic_script(soup): - script_tags = soup.find_all("script") - for script_tag in script_tags: - if ( - script_tag.string - and "mosaic.providerData" in script_tag.string - and "mosaic-provider-jobcards" in script_tag.string - ): - return script_tag - return None diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 0f9de6f..aaec33b 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -1,8 +1,9 @@ -from math import ceil +from typing import Optional import requests from bs4 import BeautifulSoup -from fastapi import HTTPException, status +from bs4.element import Tag +from fastapi import status from api.core.scrapers import Scraper, ScraperInput, Site from api.core.jobs import * @@ -10,86 +11,114 @@ from api.core.jobs import * class LinkedInScraper(Scraper): def __init__(self): + """ + Initializes LinkedInScraper with the LinkedIn job search url + """ site = Site(Site.LINKEDIN) super().__init__(site) self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: - current_page = 0 - - params = { - "pageNum": current_page, - "location": scraper_input.location, - "distance": scraper_input.distance, - } - - self.url = f"{self.url}/{scraper_input.search_term}-jobs" - response = requests.get(self.url, params=params) - - if response.status_code != status.HTTP_200_OK: - return JobResponse( - success=False, - error=f"Response returned {response.status_code}", - ) - - soup = BeautifulSoup(response.text, "html.parser") - + """ + Scrapes LinkedIn for jobs with scraper_input criteria + :param scraper_input: + :return: job_response + """ job_list: list[JobPost] = [] - for job_card in soup.find_all( - "div", - class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", - ): - job_url_tag = job_card.find("a", class_="base-card__full-link") - job_url = job_url_tag["href"] if job_url_tag else "N/A" + seen_urls = set() + page, processed_jobs, job_count = 0, 0, 0 - job_info = job_card.find("div", class_="base-search-card__info") - if job_info is None: - continue - 
title_tag = job_info.find("h3", class_="base-search-card__title") - title = title_tag.text.strip() if title_tag else "N/A" + with requests.Session() as session: + while len(job_list) < scraper_input.results_wanted: + params = { + "pageNum": page, + "location": scraper_input.location, + "distance": scraper_input.distance, + } - company_tag = job_info.find("a", class_="hidden-nested-link") - company = company_tag.text.strip() if company_tag else "N/A" + self.url = f"{self.url}/{scraper_input.search_term}-jobs" + response = session.get(self.url, params=params, allow_redirects=True) - metadata_card = job_info.find("div", class_="base-search-card__metadata") - location: Location = LinkedInScraper.get_location(metadata_card) + if response.status_code != status.HTTP_200_OK: + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", + ) - datetime_tag = metadata_card.find( - "time", class_="job-search-card__listdate" - ) - if datetime_tag: - datetime_str = datetime_tag["datetime"] - date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") - else: - date_posted = None + soup = BeautifulSoup(response.text, "html.parser") - job_post = JobPost( - title=title, - company_name=company, - location=location, - date_posted=date_posted, - delivery=Delivery(method=DeliveryEnum.URL, value=job_url), - ) - job_list.append(job_post) + if page == 0: + job_count_text = soup.find( + "span", class_="results-context-header__job-count" + ).text + job_count = int("".join(filter(str.isdigit, job_count_text))) - job_count_text = soup.find( - "span", class_="results-context-header__job-count" - ).text - job_count = int("".join(filter(str.isdigit, job_count_text))) - total_pages = ceil(job_count / 25) + for job_card in soup.find_all( + "div", + class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", + ): + job_url_tag = job_card.find("a", class_="base-card__full-link") + job_url = job_url_tag["href"] if job_url_tag else "N/A" + if job_url in seen_urls: + continue + seen_urls.add(job_url) + job_info = job_card.find("div", class_="base-search-card__info") + if job_info is None: + continue + title_tag = job_info.find("h3", class_="base-search-card__title") + title = title_tag.text.strip() if title_tag else "N/A" + + company_tag = job_info.find("a", class_="hidden-nested-link") + company = company_tag.text.strip() if company_tag else "N/A" + + metadata_card = job_info.find( + "div", class_="base-search-card__metadata" + ) + location: Location = LinkedInScraper.get_location(metadata_card) + + datetime_tag = metadata_card.find( + "time", class_="job-search-card__listdate" + ) + if datetime_tag: + datetime_str = datetime_tag["datetime"] + date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") + else: + date_posted = None + + job_post = JobPost( + title=title, + company_name=company, + location=location, + date_posted=date_posted, + delivery=Delivery(method=DeliveryEnum.URL, value=job_url), + ) + job_list.append(job_post) + if len(job_list) >= scraper_input.results_wanted: + break + if ( + len(job_list) >= scraper_input.results_wanted + or processed_jobs >= job_count + ): + break + + page += 1 + + job_list = job_list[: scraper_input.results_wanted] job_response = JobResponse( success=True, - jobs=job_list, job_count=job_count, - page=current_page + 1, - total_pages=total_pages, ) return job_response @staticmethod - def get_location(metadata_card): + def get_location(metadata_card: Optional[Tag]) -> 
Location: + """ + Extracts the location data from the job metadata card. + :param metadata_card + :return: location + """ location = Location( country="US", ) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index e4c6f30..24de04b 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -1,8 +1,9 @@ import json +from typing import Optional from urllib.parse import urlparse, parse_qs import tls_client -from fastapi import HTTPException, status +from fastapi import status from bs4 import BeautifulSoup from api.core.scrapers import Scraper, ScraperInput, Site @@ -11,87 +12,111 @@ from api.core.jobs import * class ZipRecruiterScraper(Scraper): def __init__(self): + """ + Initializes LinkedInScraper with the ZipRecruiter job search url + """ site = Site(Site.ZIP_RECRUITER) super().__init__(site) self.url = "https://www.ziprecruiter.com/jobs-search" def scrape(self, scraper_input: ScraperInput) -> JobResponse: + """ + Scrapes ZipRecruiter for jobs with scraper_input criteria + :param scraper_input: + :return: job_response + """ session = tls_client.Session( client_identifier="chrome112", random_tls_extension_order=True ) - current_page = 1 - - params = { - "search": scraper_input.search_term, - "location": scraper_input.location, - "page": min(current_page, 10), - "radius": scraper_input.distance, - } - - response = session.get( - self.url, headers=ZipRecruiterScraper.headers(), params=params - ) - if response.status_code != status.HTTP_200_OK: - return JobResponse( - success=False, - error=f"Response returned {response.status_code}", - ) - - html_string = response.content - soup = BeautifulSoup(html_string, "html.parser") - - job_posts = soup.find_all("div", {"class": "job_content"}) - job_list: list[JobPost] = [] - for job in job_posts: - title = job.find("h2", {"class": "title"}).text - company = job.find("a", {"class": "company_name"}).text.strip() - description = job.find("p", {"class": "job_snippet"}).text.strip() - job_type_element = job.find("li", {"class": "perk_item perk_type"}) - job_type = ( - job_type_element.text.strip().lower().replace("-", "_") - if job_type_element - else None - ) + page = 1 + processed_jobs, job_count = 0, 0 + seen_urls = set() + while len(job_list) < scraper_input.results_wanted: + params = { + "search": scraper_input.search_term, + "location": scraper_input.location, + "page": page, + "radius": scraper_input.distance, + } - url = job.find("a", {"class": "job_link"})["href"] - date_posted = ZipRecruiterScraper.get_date_posted(job) - - job_type = job_type.replace(" ", "_") if job_type else job_type - job_post = JobPost( - title=title, - description=description, - company_name=company, - location=ZipRecruiterScraper.get_location(job), - job_type=job_type, - compensation=ZipRecruiterScraper.get_compensation(job), - date_posted=date_posted, - delivery=Delivery(method=DeliveryEnum.URL, value=url), + response = session.get( + self.url, headers=ZipRecruiterScraper.headers(), params=params ) - job_list.append(job_post) - if len(job_list) > 20: + if response.status_code != status.HTTP_200_OK: + return JobResponse( + success=False, + error=f"Response returned {response.status_code}", + ) + + html_string = response.content + soup = BeautifulSoup(html_string, "html.parser") + if page == 1: + script_tag = soup.find("script", {"id": "js_variables"}) + data = json.loads(script_tag.string) + + job_count = data["totalJobCount"] + job_count = int(job_count.replace(",", "")) + 
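# Illustration (not from this patch): the rewritten scrapers share roughly this paging
# and de-duplication pattern — the names below are a simplified outline, not exact code.
#
#   seen_urls: set[str] = set()
#   page, processed_jobs, job_count = 0, 0, 0
#   while len(job_list) < scraper_input.results_wanted:
#       # request the next page; on the first page read the site's reported job_count
#       for card in page_of_job_cards:
#           processed_jobs += 1
#           if card_url in seen_urls:
#               continue
#           seen_urls.add(card_url)
#           job_list.append(build_job_post(card))
#           if len(job_list) >= scraper_input.results_wanted:
#               break
#       if len(job_list) >= scraper_input.results_wanted or processed_jobs >= job_count:
#           break
#       page += 1
#   job_list = job_list[: scraper_input.results_wanted]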
+ job_posts = soup.find_all("div", {"class": "job_content"}) + + for job in job_posts: + processed_jobs += 1 + job_url = job.find("a", {"class": "job_link"})["href"] + if job_url in seen_urls: + continue + title = job.find("h2", {"class": "title"}).text + company = job.find("a", {"class": "company_name"}).text.strip() + description = job.find("p", {"class": "job_snippet"}).text.strip() + job_type_element = job.find("li", {"class": "perk_item perk_type"}) + job_type = ( + job_type_element.text.strip().lower().replace("-", "_") + if job_type_element + else None + ) + + date_posted = ZipRecruiterScraper.get_date_posted(job) + + job_type = job_type.replace(" ", "_") if job_type else job_type + job_post = JobPost( + title=title, + description=description, + company_name=company, + location=ZipRecruiterScraper.get_location(job), + job_type=job_type, + compensation=ZipRecruiterScraper.get_compensation(job), + date_posted=date_posted, + delivery=Delivery(method=DeliveryEnum.URL, value=job_url), + ) + job_list.append(job_post) + if len(job_list) >= scraper_input.results_wanted: + break + + if ( + len(job_list) >= scraper_input.results_wanted + or processed_jobs >= job_count + ): break - script_tag = soup.find("script", {"id": "js_variables"}) + page += 1 - data = json.loads(script_tag.string) - - job_count = data["totalJobCount"] - job_count = job_count.replace(",", "") - total_pages = data["maxPages"] + job_list = job_list[: scraper_input.results_wanted] job_response = JobResponse( success=True, jobs=job_list, job_count=job_count, - page=params["page"], - total_pages=total_pages, ) return job_response @staticmethod - def get_interval(interval_str): + def get_interval(interval_str: str): + """ + Maps the interval alias to its appropriate CompensationInterval. 
+ :param interval_str + :return: CompensationInterval + """ interval_alias = {"annually": CompensationInterval.YEARLY} interval_str = interval_str.lower() @@ -101,7 +126,12 @@ class ZipRecruiterScraper(Scraper): return CompensationInterval(interval_str) @staticmethod - def get_date_posted(job: BeautifulSoup): + def get_date_posted(job: BeautifulSoup) -> Optional[str]: + """ + Extracts the date a job was posted + :param job + :return: date the job was posted or None + """ button = job.find( "button", {"class": "action_input save_job zrs_btn_secondary_200"} ) @@ -111,16 +141,50 @@ class ZipRecruiterScraper(Scraper): return params.get("posted_time", [None])[0] @staticmethod - def get_compensation(job: BeautifulSoup): + def get_compensation(job: BeautifulSoup) -> Optional[Compensation]: + """ + Parses the compensation tag from the job BeautifulSoup object + :param job + :return: Compensation object or None + """ pay_element = job.find("li", {"class": "perk_item perk_pay"}) if pay_element is None: return None pay = pay_element.find("div", {"class": "value"}).find("span").text.strip() - return ZipRecruiterScraper.create_compensation_object(pay) + def create_compensation_object(pay_string: str) -> Compensation: + """ + Creates a Compensation object from a pay_string + :param pay_string + :return: compensation + """ + interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1]) + + amounts = [] + for amount in pay_string.split("to"): + amount = amount.replace(",", "").strip("$ ").split(" ")[0] + if "K" in amount: + amount = amount.replace("K", "") + amount = float(amount) * 1000 + else: + amount = float(amount) + amounts.append(amount) + + compensation = Compensation( + interval=interval, min_amount=min(amounts), max_amount=max(amounts) + ) + + return compensation + + return create_compensation_object(pay) @staticmethod - def get_location(job: BeautifulSoup): + def get_location(job: BeautifulSoup) -> Location: + """ + Extracts the job location from BeatifulSoup object + :param job: + :return: location + """ location_string = job.find("a", {"class": "company_location"}).text.strip() parts = location_string.split(", ") city, state = parts @@ -131,27 +195,11 @@ class ZipRecruiterScraper(Scraper): ) @staticmethod - def create_compensation_object(pay_string: str): - interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1]) - - amounts = [] - for amount in pay_string.split("to"): - amount = amount.replace(",", "").strip("$ ").split(" ")[0] - if "K" in amount: - amount = amount.replace("K", "") - amount = float(amount) * 1000 - else: - amount = float(amount) - amounts.append(amount) - - compensation = Compensation( - interval=interval, min_amount=min(amounts), max_amount=max(amounts) - ) - - return compensation - - @staticmethod - def headers(): + def headers() -> dict: + """ + Returns headers needed for requests + :return: dict - Dictionary containing headers + """ return { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" } diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index d659ff4..753a754 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Depends +from fastapi import APIRouter from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper @@ -15,9 +15,7 @@ SCRAPER_MAPPING = { @router.post("/", response_model=JobResponse) -async def scrape_jobs( - scraper_input: 
ScraperInput -): +async def scrape_jobs(scraper_input: ScraperInput): scraper_class = SCRAPER_MAPPING[scraper_input.site_type] scraper = scraper_class() diff --git a/settings.py b/settings.py index 39b21eb..0cab126 100644 --- a/settings.py +++ b/settings.py @@ -6,4 +6,4 @@ SUPABASE_URL = os.environ.get("SUPABASE_URL") SUPABASE_KEY = os.environ.get("SUPABASE_KEY") JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY") ALGORITHM = "HS256" -ACCESS_TOKEN_EXPIRE_MINUTES = 30 +ACCESS_TOKEN_EXPIRE_MINUTES = 120 From 40cb07ffe3455e7910417d47c55f252be4707861 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 22:14:38 -0500 Subject: [PATCH 17/18] docs: add readme --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..399bd58 --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +# JobSpy Backend + +JobSpy Backend is a RESTful API built with FastAPI that allows users to scrape job postings from various job boards such as LinkedIn, Indeed, and ZipRecruiter. + +## Features + +- User authentication and token-based authorization +- Scraping job postings from LinkedIn, Indeed, and ZipRecruiter +- Detailed job data including title, location, company, and more + +## Endpoints + +- `/api/v1/jobs/`: POST endpoint to scrape jobs. Accepts parameters for site_type (job board), search term, location, distance, and results wanted. +- `/api/auth/token/`: POST endpoint for user authentication. Returns an access token. +- `/api/auth/register/`: POST endpoint to register a new user. +- `/health`: GET endpoint for a simple health check of the application. + +## Installation + +1. Clone this repository. +2. Install the dependencies with `pip install -r requirements.txt`. +3. Run the server with `uvicorn main:app --reload`. + +## Usage + +Visit http://localhost:8000/docs in your web browser to see the automatic interactive API documentation. + From 1663a55ed840fc9b471a5fdd13966547ba5a51b8 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 22:15:51 -0500 Subject: [PATCH 18/18] chore: move postman files into dir --- .../JobSpy.postman_collection.json | 0 .../JobSpy.postman_environment.json | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename JobSpy.postman_collection.json => postman/JobSpy.postman_collection.json (100%) rename JobSpy.postman_environment.json => postman/JobSpy.postman_environment.json (100%) diff --git a/JobSpy.postman_collection.json b/postman/JobSpy.postman_collection.json similarity index 100% rename from JobSpy.postman_collection.json rename to postman/JobSpy.postman_collection.json diff --git a/JobSpy.postman_environment.json b/postman/JobSpy.postman_environment.json similarity index 100% rename from JobSpy.postman_environment.json rename to postman/JobSpy.postman_environment.json
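
As a quick illustration of the flow the README describes (a sketch under assumptions, not part of the patch series): the snippet below registers a user, requests a token, and calls the jobs endpoint with Python's requests library. The registration field names and the "site_type" value are inferred from the models in this series and may differ; the Authorization header is included in case the jobs route is protected in a given build.

import requests

BASE = "http://localhost:8000"

# Register a user (illustrative field names; match the UserCreate model in your build).
requests.post(
    f"{BASE}/api/auth/register/",
    json={"username": "demo", "email": "demo@example.com", "password": "secret"},
)

# Obtain a bearer token; the token route expects OAuth2 form fields, not JSON.
token = requests.post(
    f"{BASE}/api/auth/token/",
    data={"username": "demo", "password": "secret"},
).json()["access_token"]

# Scrape jobs; the header is harmless if the route is unprotected.
resp = requests.post(
    f"{BASE}/api/v1/jobs/",
    headers={"Authorization": f"Bearer {token}"},
    json={
        "site_type": "indeed",  # assumed Site enum value
        "search_term": "software engineer",
        "location": "Dallas, TX",
        "distance": 25,
        "results_wanted": 10,
    },
)
print(resp.json()["job_count"])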