feat: add ZipRecruiterScraper

pull/12/head
Cullen Watson 2023-07-08 06:57:36 -05:00
parent 4c4f9883ae
commit 2c08d84a4f
4 changed files with 181 additions and 20 deletions

View File

@ -6,17 +6,11 @@ from enum import Enum
class JobType(Enum):
    """Employment types a job posting can be tagged with.

    Each value is the normalised string form of the type (lower-case,
    words joined by underscores).
    """

    FULL_TIME = "full_time"
    PART_TIME = "part_time"
    # NOTE(review): "contractor" (not "contract") is the post-commit value;
    # presumably it mirrors the label text scraped from the job site -- confirm.
    CONTRACT = "contractor"
    INTERNSHIP = "internship"
    TEMPORARY = "temporary"
PER_DIEM = "per_diem"
NIGHTS = "nights"
class CompensationInterval(Enum):
YEARLY = "yearly"
MONTHLY = "monthly"
WEEKLY = "weekly"
DAILY = "daily"
HOURLY = "hourly"
class Location(BaseModel): class Location(BaseModel):
@ -27,11 +21,19 @@ class Location(BaseModel):
address: str = None address: str = None
class CompensationInterval(Enum):
    """Pay periods in which a compensation amount can be expressed."""

    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"
class Compensation(BaseModel):
    """Pay range attached to a job posting.

    ``min_amount`` and ``max_amount`` are floats so that fractional rates
    (for example hourly pay) are representable.
    """

    # Period the amounts below refer to (yearly, hourly, ...).
    interval: CompensationInterval
    min_amount: float
    max_amount: float
    # ISO 4217 currency code. "US" is a country code, not a currency code,
    # so the default is the US dollar code "USD".
    currency: str = "USD"
class DeliveryEnum(Enum): class DeliveryEnum(Enum):
@ -48,9 +50,8 @@ class JobPost(BaseModel):
title: str title: str
description: str = None description: str = None
company_name: str company_name: str
industry: str = None
location: Location location: Location
job_type: JobType job_type: JobType = None
compensation: Compensation = None compensation: Compensation = None
date_posted: datetime date_posted: datetime
delivery: Delivery = None delivery: Delivery = None

View File

@ -38,7 +38,7 @@ class IndeedScraper(Scraper):
total_pages = math.ceil(total_num_jobs / 15) total_pages = math.ceil(total_num_jobs / 15)
job_list: list[JobPost] = [] job_list: list[JobPost] = []
page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
snippet_html = BeautifulSoup(job["snippet"], "html.parser") snippet_html = BeautifulSoup(job["snippet"], "html.parser")
@ -92,7 +92,7 @@ class IndeedScraper(Scraper):
job_response = JobResponse( job_response = JobResponse(
jobs=job_list, jobs=job_list,
job_count=total_num_jobs, job_count=total_num_jobs,
page=page_number, page=scraper_input.page,
total_pages=total_pages, total_pages=total_pages,
) )
return job_response return job_response

View File

@ -0,0 +1,147 @@
import json
from urllib.parse import urlparse, parse_qs
import tls_client
from bs4 import BeautifulSoup
from .. import Scraper, ScraperInput, Site
from ...jobs import *
class ZipRecruiterScraper(Scraper):
    """Scraper for ZipRecruiter job-search result pages.

    One call to :meth:`scrape` performs a single HTTP GET against the search
    URL, parses the job cards out of the returned HTML and converts each card
    into a ``JobPost``.
    """

    def __init__(self):
        """Set the site identifier on the base class and the search URL."""
        # NOTE(review): this registers the scraper under ``Site.INDEED``, which
        # looks like a copy-paste from the Indeed scraper. Confirm whether
        # ``Site`` has a dedicated ZipRecruiter member and switch to it.
        site = Site(Site.INDEED)
        super().__init__(site)

        self.url = "https://www.ziprecruiter.com/jobs-search"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch and parse one page of ZipRecruiter search results.

        :param scraper_input: search term, location and (optional) page number.
        :return: a ``JobResponse`` with the parsed jobs, the total job count,
            the requested page and the total page count. When the HTTP status
            is not 200 a ``{"message": ...}`` dict is returned instead.
        """
        session = tls_client.Session(
            client_identifier="chrome112", random_tls_extension_order=True
        )

        params = {
            "search": scraper_input.search_term,
            "location": scraper_input.location,
            # Default to the first page and never request beyond page 10.
            "page": min(scraper_input.page if scraper_input.page else 1, 10),
        }

        response = session.get(
            self.url, headers=ZipRecruiterScraper.headers(), params=params
        )
        if response.status_code != 200:
            # NOTE(review): a plain dict, not a JobResponse, is returned here;
            # kept as-is because callers may rely on this shape.
            return {"message": f"Error - Status Code: {response.status_code}"}

        soup = BeautifulSoup(response.content, "html.parser")

        job_list: list[JobPost] = []
        for job in soup.find_all("div", {"class": "job_content"}):
            # Strip the title like the other text fields so surrounding
            # markup whitespace does not leak into the model.
            title = job.find("h2", {"class": "title"}).text.strip()
            company = job.find("a", {"class": "company_name"}).text.strip()
            description = job.find("p", {"class": "job_snippet"}).text.strip()

            # Normalise the perk label in one place: lower-case it and turn
            # hyphens and spaces into underscores, e.g. "Full-Time" ->
            # "full_time". ``None`` when the card has no job-type perk.
            job_type_element = job.find("li", {"class": "perk_item perk_type"})
            job_type = None
            if job_type_element:
                job_type = (
                    job_type_element.text.strip()
                    .lower()
                    .replace("-", "_")
                    .replace(" ", "_")
                )

            url = job.find("a", {"class": "job_link"})["href"]

            job_list.append(
                JobPost(
                    title=title,
                    description=description,
                    company_name=company,
                    location=ZipRecruiterScraper.get_location(job),
                    job_type=job_type,
                    compensation=ZipRecruiterScraper.get_compensation(job),
                    date_posted=ZipRecruiterScraper.get_date_posted(job),
                    delivery=Delivery(method=DeliveryEnum.URL, value=url),
                )
            )

        # Totals are embedded as JSON in the page's "js_variables" script tag.
        script_tag = soup.find("script", {"id": "js_variables"})
        data = json.loads(script_tag.string)

        # Coerce to str first so a numeric count does not break the
        # thousands-separator removal.
        job_count = str(data["totalJobCount"]).replace(",", "")
        total_pages = data["maxPages"]

        return JobResponse(
            jobs=job_list,
            job_count=job_count,
            page=params["page"],
            total_pages=total_pages,
        )

    @staticmethod
    def get_interval(interval_str):
        """Map a pay-period word to a ``CompensationInterval``.

        :param interval_str: period word such as "hourly" or "annually"
            (case-insensitive).
        :return: the matching ``CompensationInterval`` member.
        :raises ValueError: if the word is neither a known alias nor an
            enum value.
        """
        interval_alias = {"annually": CompensationInterval.YEARLY}
        interval_str = interval_str.lower()

        alias = interval_alias.get(interval_str)
        if alias is not None:
            return alias

        return CompensationInterval(interval_str)

    @staticmethod
    def get_date_posted(job):
        """Extract the posting time from a job card.

        The value is read from the ``posted_time`` query parameter of the
        save-job button's ``data-href`` attribute.

        :param job: parsed HTML element of one job card (an object exposing
            ``find``), not a string.
        :return: the raw ``posted_time`` string, or ``None`` if the parameter
            is absent from the URL.
        """
        button = job.find(
            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
        )
        url_components = urlparse(button["data-href"])
        query = parse_qs(url_components.query)
        return query.get("posted_time", [None])[0]

    @staticmethod
    def get_compensation(job):
        """Build the compensation for a job card.

        :param job: parsed HTML element of one job card.
        :return: a ``Compensation`` built from the pay perk text, or ``None``
            when the card has no pay perk.
        """
        pay_element = job.find("li", {"class": "perk_item perk_pay"})
        if pay_element is None:
            return None

        pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
        return ZipRecruiterScraper.create_compensation_object(pay)

    @staticmethod
    def get_location(job):
        """Build the ``Location`` for a job card.

        The location text is split on commas. The first part is used as the
        city and the second, when present, as the state, so values that are
        not exactly "City, ST" (e.g. "Remote" or "City, ST, USA") no longer
        fail on tuple unpacking.

        :param job: parsed HTML element of one job card.
        :return: a ``Location`` with ``country`` fixed to "US".
        """
        location_string = job.find("a", {"class": "company_location"}).text.strip()
        parts = [part.strip() for part in location_string.split(",")]

        city = parts[0]
        # NOTE(review): assumes ``Location.state`` accepts ``None`` -- confirm.
        state = parts[1] if len(parts) > 1 else None

        return Location(
            country="US",
            city=city,
            state=state,
        )

    @staticmethod
    def create_compensation_object(pay_string: str):
        """Parse a pay string into a ``Compensation``.

        The last word of the string is taken as the pay interval. The string
        is split on "to" and the leading number of each piece is read as an
        amount; a "K" suffix multiplies the amount by 1000.

        :param pay_string: text such as "$20 to $25 hourly" or
            "$50K annually".
        :return: a ``Compensation`` whose min/max are the smallest and
            largest parsed amounts.
        :raises ValueError: if the interval word or an amount cannot be parsed.
        """
        interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])

        amounts = []
        for amount in pay_string.split("to"):
            # Drop thousands separators and the currency sign, then keep only
            # the leading token (discarding e.g. the trailing interval word).
            amount = amount.replace(",", "").strip("$ ").split(" ")[0]
            if "K" in amount:
                amount = float(amount.replace("K", "")) * 1000
            else:
                amount = float(amount)
            amounts.append(amount)

        return Compensation(
            interval=interval, min_amount=min(amounts), max_amount=max(amounts)
        )

    @staticmethod
    def headers():
        """Return the HTTP headers sent with every search request."""
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
        }

View File

@ -1,6 +1,7 @@
from fastapi import APIRouter from fastapi import APIRouter
from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers import ScraperInput from api.core.scrapers import ScraperInput
from api.core.jobs import JobResponse from api.core.jobs import JobResponse
@ -8,9 +9,21 @@ router = APIRouter(prefix="/jobs")
@router.get("/")
async def scrape_jobs(
    site_type: str, search_term: str, location: str, page: int = None
):
    """Run a job search against the scraper selected by ``site_type``.

    :param site_type: "indeed" or "zip".
    :param search_term: text to search for.
    :param location: location to search in.
    :param page: optional results page number.
    :return: whatever the selected scraper's ``scrape`` returns, or
        ``{"message": "site type not found"}`` for an unknown ``site_type``.
    """
    # Dispatch table: request value -> scraper class.
    scraper_classes = {
        "indeed": IndeedScraper,
        "zip": ZipRecruiterScraper,
    }

    scraper_class = scraper_classes.get(site_type)
    if scraper_class is None:
        return {"message": "site type not found"}

    scraper = scraper_class()
    scraper_input = ScraperInput(
        search_term=search_term, location=location, page=page
    )
    return scraper.scrape(scraper_input)