feat(jobs): remove pages for results_wanted

pull/12/head
Cullen Watson 2023-07-10 22:07:19 -05:00
parent bf56410ecf
commit 3240214bb1
11 changed files with 449 additions and 271 deletions

View File

@@ -12,7 +12,12 @@ load_dotenv()
 oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")

-def create_access_token(data: dict):
+def create_access_token(data: dict) -> str:
+    """
+    Creates a JWT token based on the data provided.
+    :param data
+    :return: encoded_jwt
+    """
     to_encode = data.copy()
     expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
     to_encode.update({"exp": expire})
@@ -21,6 +26,12 @@ def create_access_token(data: dict):

 async def get_current_user(token: str = Depends(oauth2_scheme)):
+    """
+    Returns the current user associated with the provided JWT token.
+    :param token
+    :raises HTTPException: If the token is invalid or the user does not exist.
+    :return: The UserInDB instance associated with the token.
+    """
     credential_exception = HTTPException(
         status_code=status.HTTP_401_UNAUTHORIZED,
         detail="Could not validate credentials",
@@ -42,6 +53,13 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):

 async def get_active_current_user(current_user: UserInDB = Depends(get_current_user)):
+    """
+    Returns the current user if the user account is active.
+    :param current_user: A UserInDB instance representing the current user.
+    :raises HTTPException: If the user account is inactive.
+    :return: The UserInDB instance if the user account is active.
+    """
     if current_user.disabled:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED, detail="Inactive user."
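For context, a minimal sketch of how these dependencies chain in a protected route; the route path, router name, and import locations are illustrative, not part of this commit:

    # Hypothetical protected route: FastAPI resolves oauth2_scheme -> get_current_user
    # -> get_active_current_user and returns 401 if the JWT is invalid or the user is disabled.
    # UserInDB and get_active_current_user are assumed importable from the app's auth modules.
    from fastapi import APIRouter, Depends

    router = APIRouter(prefix="/me", tags=["me"])

    @router.get("/")
    async def read_current_user(current_user: UserInDB = Depends(get_active_current_user)):
        return {"username": current_user.username}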

View File

@@ -1,3 +1,5 @@
+from typing import Optional, Union
+
 from passlib.context import CryptContext
 from supabase_py import create_client, Client
 from fastapi import HTTPException, status
@@ -10,6 +12,13 @@ supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

 def create_user(user_create: UserInDB):
+    """
+    Creates a new user record in the 'users' table in Supabase.
+    :param user_create: The data of the user to be created.
+    :raises HTTPException: If an error occurs while creating the user.
+    :return: The result of the insert operation.
+    """
     result = supabase.table("users").insert(user_create.dict()).execute()
     print(f"Insert result: {result}")
@@ -22,7 +31,13 @@ def create_user(user_create: UserInDB):
     return result

-def get_user(username: str):
+def get_user(username: str) -> Optional[UserInDB]:
+    """
+    Retrieves a user from the 'users' table by their username.
+    :param username: The username of the user to retrieve.
+    :return: The user data if found, otherwise None.
+    """
     result = supabase.table("users").select().eq("username", username).execute()

     if "error" in result and result["error"]:
@@ -36,15 +51,35 @@ def get_user(username: str):
     return None

-def verify_password(password: str, hashed_password: str):
+def verify_password(password: str, hashed_password: str) -> bool:
+    """
+    Verifies a password against a hashed password using the bcrypt hashing algorithm.
+    :param password: The plaintext password to verify.
+    :param hashed_password: The hashed password to compare against.
+    :return: True if the password matches the hashed password, otherwise False.
+    """
     return pwd_context.verify(password, hashed_password)

-def get_password_hash(password):
+def get_password_hash(password: str) -> str:
+    """
+    Hashes a password using the bcrypt hashing algorithm.
+    :param password: The plaintext password to hash.
+    :return: The hashed password
+    """
     return pwd_context.hash(password)

-def authenticate_user(username: str, password: str):
+def authenticate_user(username: str, password: str) -> Union[UserInDB, bool]:
+    """
+    Authenticates a user based on their username and password.
+    :param username: The username of the user to authenticate.
+    :param password: The plaintext password to authenticate.
+    :return: The authenticated user if the username and password are correct, otherwise False.
+    """
     user = get_user(username)
     if not user:
         return False

View File

@@ -5,8 +5,14 @@ from api.auth.db_utils import get_user, get_password_hash, create_user

 router = APIRouter(prefix="/register", tags=["register"])

-@router.post("/")
-async def register_new_user(user: UserCreate):
+@router.post("/", response_model=dict)
+async def register_new_user(user: UserCreate) -> dict:
+    """
+    Creates new user
+    :param user:
+    :raises HTTPException: If the username already exists.
+    :return: A dictionary containing a detail key with a success message.
+    """
     existing_user = get_user(user.username)
     if existing_user is not None:
         raise HTTPException(
@@ -15,7 +21,6 @@ async def register_new_user(user: UserCreate):
         )

     hashed_password = get_password_hash(user.password)
-    print(f"Hashed password: {hashed_password}")
     user_create = UserInDB(
         username=user.username,
         email=user.email,

View File

@@ -9,7 +9,15 @@ router = APIRouter(prefix="/token", tags=["token"])

 @router.post("/", response_model=Token)
-async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
+async def login_for_access_token(
+    form_data: OAuth2PasswordRequestForm = Depends(),
+) -> Token:
+    """
+    Authenticates a user and provides an access token.
+    :param form_data: OAuth2PasswordRequestForm object containing the user's credentials.
+    :raises HTTPException: If the user cannot be authenticated.
+    :return: A Token object containing the access token and the token type.
+    """
     user = authenticate_user(form_data.username, form_data.password)
     if not user:
         raise HTTPException(
@@ -19,4 +27,4 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
         )

     access_token = create_access_token(data={"sub": user.username})
-    return {"access_token": access_token, "token_type": "bearer"}
+    return Token(access_token=access_token, token_type="bearer")

View File

@@ -11,6 +11,7 @@ class JobType(Enum):
     TEMPORARY = "temporary"
     PER_DIEM = "per_diem"
     NIGHTS = "nights"
+    OTHER = "other"


 class Location(BaseModel):
@@ -61,9 +62,5 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None
-    total_pages: int = None
     job_count: int = None
-    page: int = None
     jobs: list[JobPost] = []

View File

@@ -1,6 +1,6 @@
 from pydantic import BaseModel
 from enum import Enum
-from ..jobs import JobResponse, JobPost
+from ..jobs import JobResponse


 class Site(Enum):
@@ -16,11 +16,12 @@ class ScraperInput(BaseModel):
     location: str
     distance: int = 25
-    results_wanted: int = 15  #: TODO: implement
+    results_wanted: int = 15


-class Scraper:  #: to be used as a child class
+class Scraper:
     def __init__(self, site: Site):
         self.site = site

-    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        ...
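The remaining scraper changes make each site honor results_wanted instead of reporting page/total_pages. A self-contained sketch of the shared loop pattern, assuming a placeholder fetch_page callable in place of each site's request-and-parse step (collect_results and fetch_page are not names from this repo):

    def collect_results(results_wanted: int, fetch_page) -> list[str]:
        job_list: list[str] = []
        seen_urls: set[str] = set()
        page, processed_jobs, job_count = 0, 0, 0

        while len(job_list) < results_wanted:
            urls, job_count = fetch_page(page)              # site-specific request + parse
            for url in urls:
                processed_jobs += 1
                if url in seen_urls:                        # de-duplicate across pages
                    continue
                seen_urls.add(url)
                job_list.append(url)
                if len(job_list) >= results_wanted:
                    break
            if len(job_list) >= results_wanted or processed_jobs >= job_count:
                break
            page += 1

        return job_list[:results_wanted]

    # Fake source returning 10 urls per page out of 35 total:
    assert len(collect_results(15, lambda p: ([f"job-{p}-{i}" for i in range(10)], 35))) == 15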

View File

@@ -1,10 +1,11 @@
 import re
 import json
-from math import ceil
+from typing import Optional

 import tls_client
 from bs4 import BeautifulSoup
-from fastapi import HTTPException, status
+from bs4.element import Tag
+from fastapi import status

 from api.core.jobs import *
 from api.core.scrapers import Scraper, ScraperInput, Site
@@ -16,24 +17,38 @@ class ParsingException(Exception):

 class IndeedScraper(Scraper):
     def __init__(self):
+        """
+        Initializes IndeedScraper with the Indeed job search url
+        """
         site = Site(Site.INDEED)
         super().__init__(site)
         self.url = "https://www.indeed.com/jobs"

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes Indeed for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
         session = tls_client.Session(
             client_identifier="chrome112", random_tls_extension_order=True
         )

+        job_list: list[JobPost] = []
+        page = 0
+        processed_jobs, total_num_jobs = 0, 0
+        seen_urls = set()
+        while len(job_list) < scraper_input.results_wanted:
             params = {
                 "q": scraper_input.search_term,
                 "l": scraper_input.location,
                 "filter": 0,
-                "start": 0,
+                "start": 0 + page * 10,
                 "radius": scraper_input.distance,
             }
             response = session.get(self.url, params=params)

             if response.status_code != status.HTTP_200_OK:
                 return JobResponse(
                     success=False,
@@ -51,24 +66,30 @@ class IndeedScraper(Scraper):
                 )

             total_num_jobs = IndeedScraper.total_jobs(soup)
-            total_pages = ceil(total_num_jobs / 15)

-            job_list: list[JobPost] = []
-            if not jobs.get('metaData', {}).get("mosaicProviderJobCardsModel", {}).get("results"):
+            if (
+                not jobs.get("metaData", {})
+                .get("mosaicProviderJobCardsModel", {})
+                .get("results")
+            ):
                 return JobResponse(
                     success=False,
                     error="No jobs found",
                 )

-            page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
             for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
+                job_url = job["thirdPartyApplyUrl"]
+                if job_url in seen_urls:
+                    continue
                 snippet_html = BeautifulSoup(job["snippet"], "html.parser")

                 extracted_salary = job.get("extractedSalary")
                 compensation = None
                 if extracted_salary:
                     salary_snippet = job.get("salarySnippet")
-                    currency = salary_snippet.get("currency") if salary_snippet else None
+                    currency = (
+                        salary_snippet.get("currency") if salary_snippet else None
+                    )
                     interval = (extracted_salary.get("type"),)
                     if isinstance(interval, tuple):
                         interval = interval[0]
@@ -84,9 +105,7 @@
                 job_type = IndeedScraper.get_job_type(job)

                 if job.get("thirdPartyApplyUrl"):
-                    delivery = Delivery(
-                        method=DeliveryEnum.URL, value=job["thirdPartyApplyUrl"]
-                    )
+                    delivery = Delivery(method=DeliveryEnum.URL, value=job_url)
                 else:
                     delivery = None

                 timestamp_seconds = job["pubDate"] / 1000
@@ -109,19 +128,32 @@ class IndeedScraper(Scraper):
                     delivery=delivery,
                 )
                 job_list.append(job_post)
+                if len(job_list) >= scraper_input.results_wanted:
+                    break
+
+            if (
+                len(job_list) >= scraper_input.results_wanted
+                or processed_jobs >= total_num_jobs
+            ):
+                break
+
+            page += 1
+
+        job_list = job_list[: scraper_input.results_wanted]
         job_response = JobResponse(
             success=True,
             jobs=job_list,
             job_count=total_num_jobs,
-            page=page_number,
-            total_pages=total_pages,
         )
         return job_response

     @staticmethod
-    def get_job_type(data):
-        for taxonomy in data["taxonomyAttributes"]:
+    def get_job_type(job: dict) -> Optional[JobType]:
+        """
+        Parses the job to get JobType
+        :param job:
+        :return:
+        """
+        for taxonomy in job["taxonomyAttributes"]:
             if taxonomy["label"] == "job-types":
                 if len(taxonomy["attributes"]) > 0:
                     job_type_str = (
@@ -137,19 +169,31 @@ class IndeedScraper(Scraper):
     def parse_jobs(soup: BeautifulSoup) -> dict:
         """
         Parses the jobs from the soup object
         :param soup:
         :return: jobs
         """
-        script_tag = IndeedScraper.find_mosaic_script(soup)

+        def find_mosaic_script() -> Optional[Tag]:
+            """
+            Finds jobcards script tag
+            :return: script_tag
+            """
+            script_tags = soup.find_all("script")
+            for tag in script_tags:
+                if (
+                    tag.string
+                    and "mosaic.providerData" in tag.string
+                    and "mosaic-provider-jobcards" in tag.string
+                ):
+                    return tag
+            return None
+
+        script_tag = find_mosaic_script()
         if script_tag:
             script_str = script_tag.string
             pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
             p = re.compile(pattern, re.DOTALL)
             m = p.search(script_str)
             if m:
                 jobs = json.loads(m.group(1).strip())
@@ -157,10 +201,17 @@ class IndeedScraper(Scraper):
             else:
                 raise ParsingException("Could not find mosaic provider job cards data")
         else:
-            raise ParsingException("Could not find a script tag containing mosaic provider data")
+            raise ParsingException(
+                "Could not find a script tag containing mosaic provider data"
+            )

     @staticmethod
-    def total_jobs(soup):
+    def total_jobs(soup: BeautifulSoup) -> int:
+        """
+        Parses the total jobs for that search from soup object
+        :param soup:
+        :return: total_num_jobs
+        """
         script = soup.find("script", string=lambda t: "window._initialData" in t)

         pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL)
@@ -169,17 +220,5 @@ class IndeedScraper(Scraper):
         if match:
             json_str = match.group(1)
             data = json.loads(json_str)
-            total_num_jobs = data["searchTitleBarModel"]["totalNumResults"]
+            total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
         return total_num_jobs
-
-    @staticmethod
-    def find_mosaic_script(soup):
-        script_tags = soup.find_all("script")
-        for script_tag in script_tags:
-            if (
-                script_tag.string
-                and "mosaic.providerData" in script_tag.string
-                and "mosaic-provider-jobcards" in script_tag.string
-            ):
-                return script_tag
-        return None

View File

@@ -1,8 +1,9 @@
-from math import ceil
+from typing import Optional

 import requests
 from bs4 import BeautifulSoup
-from fastapi import HTTPException, status
+from bs4.element import Tag
+from fastapi import status

 from api.core.scrapers import Scraper, ScraperInput, Site
 from api.core.jobs import *
@@ -10,22 +11,34 @@ from api.core.jobs import *
 class LinkedInScraper(Scraper):
     def __init__(self):
+        """
+        Initializes LinkedInScraper with the LinkedIn job search url
+        """
         site = Site(Site.LINKEDIN)
         super().__init__(site)
         self.url = "https://www.linkedin.com/jobs"

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
-        current_page = 0
+        """
+        Scrapes LinkedIn for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
+        job_list: list[JobPost] = []
+        seen_urls = set()
+        page, processed_jobs, job_count = 0, 0, 0
+        with requests.Session() as session:
+            while len(job_list) < scraper_input.results_wanted:
                 params = {
-                    "pageNum": current_page,
+                    "pageNum": page,
                     "location": scraper_input.location,
                     "distance": scraper_input.distance,
                 }

                 self.url = f"{self.url}/{scraper_input.search_term}-jobs"
-                response = requests.get(self.url, params=params)
+                response = session.get(self.url, params=params, allow_redirects=True)

                 if response.status_code != status.HTTP_200_OK:
                     return JobResponse(
@@ -35,14 +48,21 @@ class LinkedInScraper(Scraper):
                 soup = BeautifulSoup(response.text, "html.parser")

-                job_list: list[JobPost] = []
+                if page == 0:
+                    job_count_text = soup.find(
+                        "span", class_="results-context-header__job-count"
+                    ).text
+                    job_count = int("".join(filter(str.isdigit, job_count_text)))

                 for job_card in soup.find_all(
                     "div",
                     class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
                 ):
                     job_url_tag = job_card.find("a", class_="base-card__full-link")
                     job_url = job_url_tag["href"] if job_url_tag else "N/A"
+                    if job_url in seen_urls:
+                        continue
+                    seen_urls.add(job_url)

                     job_info = job_card.find("div", class_="base-search-card__info")
                     if job_info is None:
                         continue
@@ -52,7 +72,9 @@ class LinkedInScraper(Scraper):
                     company_tag = job_info.find("a", class_="hidden-nested-link")
                     company = company_tag.text.strip() if company_tag else "N/A"

-                    metadata_card = job_info.find("div", class_="base-search-card__metadata")
+                    metadata_card = job_info.find(
+                        "div", class_="base-search-card__metadata"
+                    )
                     location: Location = LinkedInScraper.get_location(metadata_card)

                     datetime_tag = metadata_card.find(
@@ -72,24 +94,31 @@ class LinkedInScraper(Scraper):
                         delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
                     )
                     job_list.append(job_post)
+                    if len(job_list) >= scraper_input.results_wanted:
+                        break
+
+                if (
+                    len(job_list) >= scraper_input.results_wanted
+                    or processed_jobs >= job_count
+                ):
+                    break
+
+                page += 1

-        job_count_text = soup.find(
-            "span", class_="results-context-header__job-count"
-        ).text
-        job_count = int("".join(filter(str.isdigit, job_count_text)))
-        total_pages = ceil(job_count / 25)
+        job_list = job_list[: scraper_input.results_wanted]
         job_response = JobResponse(
             success=True,
             jobs=job_list,
             job_count=job_count,
-            page=current_page + 1,
-            total_pages=total_pages,
         )
         return job_response

     @staticmethod
-    def get_location(metadata_card):
+    def get_location(metadata_card: Optional[Tag]) -> Location:
+        """
+        Extracts the location data from the job metadata card.
+        :param metadata_card
+        :return: location
+        """
         location = Location(
             country="US",
         )

View File

@@ -1,8 +1,9 @@
 import json
+from typing import Optional
 from urllib.parse import urlparse, parse_qs

 import tls_client
-from fastapi import HTTPException, status
+from fastapi import status
 from bs4 import BeautifulSoup

 from api.core.scrapers import Scraper, ScraperInput, Site
@@ -11,22 +12,33 @@ from api.core.jobs import *
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
+        """
+        Initializes LinkedInScraper with the ZipRecruiter job search url
+        """
         site = Site(Site.ZIP_RECRUITER)
         super().__init__(site)
         self.url = "https://www.ziprecruiter.com/jobs-search"

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes ZipRecruiter for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
         session = tls_client.Session(
             client_identifier="chrome112", random_tls_extension_order=True
         )

-        current_page = 1
+        job_list: list[JobPost] = []
+        page = 1
+        processed_jobs, job_count = 0, 0
+        seen_urls = set()
+        while len(job_list) < scraper_input.results_wanted:
             params = {
                 "search": scraper_input.search_term,
                 "location": scraper_input.location,
-                "page": min(current_page, 10),
+                "page": page,
                 "radius": scraper_input.distance,
             }
@@ -41,11 +53,20 @@ class ZipRecruiterScraper(Scraper):
             html_string = response.content
             soup = BeautifulSoup(html_string, "html.parser")
+            if page == 1:
+                script_tag = soup.find("script", {"id": "js_variables"})
+                data = json.loads(script_tag.string)
+                job_count = data["totalJobCount"]
+                job_count = int(job_count.replace(",", ""))

             job_posts = soup.find_all("div", {"class": "job_content"})

-            job_list: list[JobPost] = []
             for job in job_posts:
+                processed_jobs += 1
+                job_url = job.find("a", {"class": "job_link"})["href"]
+                if job_url in seen_urls:
+                    continue
                 title = job.find("h2", {"class": "title"}).text
                 company = job.find("a", {"class": "company_name"}).text.strip()
                 description = job.find("p", {"class": "job_snippet"}).text.strip()
@@ -56,7 +77,6 @@ class ZipRecruiterScraper(Scraper):
                     else None
                 )

-                url = job.find("a", {"class": "job_link"})["href"]
                 date_posted = ZipRecruiterScraper.get_date_posted(job)
                 job_type = job_type.replace(" ", "_") if job_type else job_type
@@ -68,30 +88,35 @@ class ZipRecruiterScraper(Scraper):
                     job_type=job_type,
                     compensation=ZipRecruiterScraper.get_compensation(job),
                     date_posted=date_posted,
-                    delivery=Delivery(method=DeliveryEnum.URL, value=url),
+                    delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
                 )
                 job_list.append(job_post)
-                if len(job_list) > 20:
+                if len(job_list) >= scraper_input.results_wanted:
                     break

-            script_tag = soup.find("script", {"id": "js_variables"})
+            if (
+                len(job_list) >= scraper_input.results_wanted
+                or processed_jobs >= job_count
+            ):
+                break

-            data = json.loads(script_tag.string)
-            job_count = data["totalJobCount"]
-            job_count = job_count.replace(",", "")
-            total_pages = data["maxPages"]
+            page += 1

+        job_list = job_list[: scraper_input.results_wanted]
         job_response = JobResponse(
             success=True,
             jobs=job_list,
             job_count=job_count,
-            page=params["page"],
-            total_pages=total_pages,
         )
         return job_response

     @staticmethod
-    def get_interval(interval_str):
+    def get_interval(interval_str: str):
+        """
+        Maps the interval alias to its appropriate CompensationInterval.
+        :param interval_str
+        :return: CompensationInterval
+        """
         interval_alias = {"annually": CompensationInterval.YEARLY}
         interval_str = interval_str.lower()
@@ -101,7 +126,12 @@ class ZipRecruiterScraper(Scraper):
         return CompensationInterval(interval_str)

     @staticmethod
-    def get_date_posted(job: BeautifulSoup):
+    def get_date_posted(job: BeautifulSoup) -> Optional[str]:
+        """
+        Extracts the date a job was posted
+        :param job
+        :return: date the job was posted or None
+        """
         button = job.find(
             "button", {"class": "action_input save_job zrs_btn_secondary_200"}
         )
@@ -111,27 +141,23 @@ class ZipRecruiterScraper(Scraper):
         return params.get("posted_time", [None])[0]

     @staticmethod
-    def get_compensation(job: BeautifulSoup):
+    def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
+        """
+        Parses the compensation tag from the job BeautifulSoup object
+        :param job
+        :return: Compensation object or None
+        """
         pay_element = job.find("li", {"class": "perk_item perk_pay"})
         if pay_element is None:
             return None
         pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()

-        return ZipRecruiterScraper.create_compensation_object(pay)
-
-    @staticmethod
-    def get_location(job: BeautifulSoup):
-        location_string = job.find("a", {"class": "company_location"}).text.strip()
-        parts = location_string.split(", ")
-        city, state = parts
-        return Location(
-            country="US",
-            city=city,
-            state=state,
-        )
-
-    @staticmethod
-    def create_compensation_object(pay_string: str):
+        def create_compensation_object(pay_string: str) -> Compensation:
+            """
+            Creates a Compensation object from a pay_string
+            :param pay_string
+            :return: compensation
+            """
             interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])

             amounts = []
@@ -150,8 +176,30 @@ class ZipRecruiterScraper(Scraper):
             return compensation

+        return create_compensation_object(pay)
+
     @staticmethod
-    def headers():
+    def get_location(job: BeautifulSoup) -> Location:
+        """
+        Extracts the job location from BeatifulSoup object
+        :param job:
+        :return: location
+        """
+        location_string = job.find("a", {"class": "company_location"}).text.strip()
+        parts = location_string.split(", ")
+        city, state = parts
+
+        return Location(
+            country="US",
+            city=city,
+            state=state,
+        )
+
+    @staticmethod
+    def headers() -> dict:
+        """
+        Returns headers needed for requests
+        :return: dict - Dictionary containing headers
+        """
         return {
             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
         }

View File

@@ -1,4 +1,4 @@
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter

 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
@@ -15,9 +15,7 @@ SCRAPER_MAPPING = {

 @router.post("/", response_model=JobResponse)
-async def scrape_jobs(
-    scraper_input: ScraperInput
-):
+async def scrape_jobs(scraper_input: ScraperInput):
     scraper_class = SCRAPER_MAPPING[scraper_input.site_type]
     scraper = scraper_class()
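A hedged usage sketch for the updated endpoint; the base URL and route prefix are assumptions, and the payload simply mirrors the ScraperInput fields, with results_wanted now driving pagination:

    import requests

    payload = {
        "site_type": "indeed",            # assumed serialization of the Site enum
        "search_term": "software engineer",
        "location": "Austin, TX",
        "distance": 25,
        "results_wanted": 40,             # replaces the old page/total_pages handling
    }
    # "/api/v1/jobs/" is a placeholder path; use whatever prefix the router is mounted under.
    resp = requests.post("http://localhost:8000/api/v1/jobs/", json=payload)
    print(resp.json()["job_count"], len(resp.json()["jobs"]))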

View File

@@ -6,4 +6,4 @@ SUPABASE_URL = os.environ.get("SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
 JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
 ALGORITHM = "HS256"
-ACCESS_TOKEN_EXPIRE_MINUTES = 30
+ACCESS_TOKEN_EXPIRE_MINUTES = 120