From 8c490babece6adedbcbef9faa0679e04006275df Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 7 Jul 2023 21:00:59 -0500 Subject: [PATCH] feat: add IndeedScraper --- .gitignore | 6 +- __pycache__/main.cpython-310.pyc | Bin 420 -> 421 bytes api/__pycache__/__init__.cpython-310.pyc | Bin 314 -> 315 bytes api/core/jobs/__init__.py | 38 ++--- api/core/scrapers/__init__.py | 9 +- api/core/scrapers/indeed/__init__.py | 158 ++++++++++++++++++ api/v1/__pycache__/__init__.cpython-310.pyc | Bin 305 -> 306 bytes api/v1/jobs/__init__.py | 13 ++ .../jobs/__pycache__/__init__.cpython-310.pyc | Bin 239 -> 711 bytes main.py | 1 - 10 files changed, 199 insertions(+), 26 deletions(-) create mode 100644 api/core/scrapers/indeed/__init__.py diff --git a/.gitignore b/.gitignore index d56bda0..2c5d1a7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ /.idea -.DS_Store \ No newline at end of file +.DS_Store +/venv/ +/ven/ +**/__pycache__/ +*.pyc \ No newline at end of file diff --git a/__pycache__/main.cpython-310.pyc b/__pycache__/main.cpython-310.pyc index 63b3d34133a78ef413fc3150cb0e93618ea1d32b..3422c7f4b1ca2408f250da32e0a93b646e442cdc 100644 GIT binary patch delta 73 zcmZ3&yp)+YpO=@50SLMeuShvOk@up$sk2o~XmM&$aZEw7v5{#^KxJ}9Vo`2DQGQlx ba!GMaR(?`(L1j!*VsdtBUP{bl9>y5}cjXzm delta 72 zcmZ3=yo8xIpO=@50SIpTEl=4sk@upWiGFBtYEiL%Rbp~RVo_yAVs1f6exANdYH@Z+ aet~{geo}EkrG8Rka&~H7ivDC?#u)%)hZ!^g diff --git a/api/__pycache__/__init__.cpython-310.pyc b/api/__pycache__/__init__.cpython-310.pyc index 8d7f747c2c97005a7f52b8f0b61bb46bdbafffd5..bc97eb3fec3b0847bc3652c55c213e1be6d6983b 100644 GIT binary patch delta 77 zcmdnRw3~@HpO=@50SKN3El-&_k@u>xt+Q23XmM&$aZEw7v5{#^KxJ}9Vo`2DQGQlx fa!GMaR(?`(L1j!*VsdtBUP?@2L1xTk5k@rtRjeclxYEiL%Rbp~RVo_yAVs1f6exANdYH@Z+ eet~{geo}EkrG8Rka&~H7ihg22rv7A6Ml}G`j2cG( diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index 4343afe..5f85a0f 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -4,18 +4,19 @@ from enum import Enum class JobType(Enum): - FULL_TIME = 'full_time' - PART_TIME = 'part_time' - CONTRACT = 'contract' - INTERNSHIP = 'internship' + FULL_TIME = "full_time" + PART_TIME = "part_time" + CONTRACT = "contract" + INTERNSHIP = "internship" + TEMPORARY = "temporary" class CompensationInterval(Enum): - ANNUAL = 'annual' - MONTHLY = 'monthly' - WEEKLY = 'weekly' - DAILY = 'daily' - HOURLY = 'hourly' + YEARLY = "yearly" + MONTHLY = "monthly" + WEEKLY = "weekly" + DAILY = "daily" + HOURLY = "hourly" class Location(BaseModel): @@ -30,12 +31,12 @@ class Compensation(BaseModel): interval: CompensationInterval min_amount: int max_amount: int - currency: str + currency: str = None class DeliveryEnum(Enum): - EMAIL = 'email' - URL = 'url' + EMAIL = "email" + URL = "url" class Delivery(BaseModel): @@ -45,20 +46,19 @@ class Delivery(BaseModel): class JobPost(BaseModel): title: str - description: str + description: str = None company_name: str - industry: str + industry: str = None location: Location job_type: JobType - compensation: Compensation + compensation: Compensation = None date_posted: datetime delivery: Delivery = None class JobResponse(BaseModel): - jobs: list[JobPost] - job_count: int - - page: int + page: int = 1 total_pages: int + + jobs: list[JobPost] diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index e31b1e2..d128403 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from pydantic import BaseModel from enum import Enum -from ..jobs import JobResponse, JobPost +from ..jobs import JobResponse class Site(Enum): @@ -10,16 +10,15 @@ class Site(Enum): class ScraperInput(BaseModel): - site: Site - location: str search_term: str - page: int = None + page: int = 1 class Scraper: #: to be used as a child class def __init__(self, site: Site): self.site = site - def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... \ No newline at end of file + def scrape(self, scraper_input: ScraperInput) -> JobResponse: + ... diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py new file mode 100644 index 0000000..188f0af --- /dev/null +++ b/api/core/scrapers/indeed/__init__.py @@ -0,0 +1,158 @@ +import json +import re +import math + +import tls_client +from bs4 import BeautifulSoup + +from .. import Scraper, ScraperInput, Site +from ...jobs import * + + +class IndeedScraper(Scraper): + def __init__(self): + site = Site(Site.INDEED) + super().__init__(site) + self.url = "https://www.indeed.com/jobs" + + def scrape(self, scraper_input: ScraperInput) -> JobResponse: + session = tls_client.Session( + client_identifier="chrome112", random_tls_extension_order=True + ) + + params = { + "q": scraper_input.search_term, + "l": scraper_input.location, + "filter": 0, + "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + } + + response = session.get(self.url, params=params) + if response.status_code != 200: + return {"message": f"Error - Status Code: {response.status_code}"} + + soup = BeautifulSoup(response.content, "html.parser") + + jobs = IndeedScraper.parse_jobs(soup) + total_num_jobs = IndeedScraper.total_jobs(soup) + total_pages = math.ceil(total_num_jobs / 15) + + job_list: list[JobPost] = [] + page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] + for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: + snippet_html = BeautifulSoup(job["snippet"], "html.parser") + + extracted_salary = job.get("extractedSalary") + compensation = None + if extracted_salary: + salary_snippet = job.get("salarySnippet") + currency = salary_snippet.get("currency") if salary_snippet else None + interval = (extracted_salary.get("type"),) + if isinstance(interval, tuple): + interval = interval[0] + + interval = interval.upper() + if interval in CompensationInterval.__members__: + compensation = Compensation( + interval=CompensationInterval[interval], + min_amount=extracted_salary.get("max"), + max_amount=extracted_salary.get("min"), + currency=currency, + ) + + job_type = IndeedScraper.get_job_type(job) + if job.get("thirdPartyApplyUrl"): + delivery = Delivery( + method=DeliveryEnum.URL, value=job["thirdPartyApplyUrl"] + ) + else: + delivery = None + timestamp_seconds = job["pubDate"] / 1000 + date_posted = datetime.fromtimestamp(timestamp_seconds) + + first_li = snippet_html.find("li") + job_post = JobPost( + title=job["normTitle"], + description=first_li.text if first_li else None, + company_name=job["company"], + industry=None, + location=Location( + city=job["jobLocationCity"], + state=job["jobLocationState"], + postal_code=job.get("jobLocationPostal"), + country="US", + ), + job_type=job_type, + compensation=compensation, + date_posted=date_posted, + delivery=delivery, + ) + job_list.append(job_post) + + job_response = JobResponse( + jobs=job_list, + job_count=total_num_jobs, + page=page_number, + total_pages=total_pages, + ) + return job_response + + @staticmethod + def get_job_type(data): + for taxonomy in data["taxonomyAttributes"]: + if taxonomy["label"] == "job-types": + if len(taxonomy["attributes"]) > 0: + job_type_str = ( + taxonomy["attributes"][0]["label"] + .replace("-", "_") + .replace(" ", "_") + .upper() + ) + return JobType[job_type_str] + return None + + @staticmethod + def parse_jobs(soup): + script_tag = IndeedScraper.find_mosaic_script(soup) + + if script_tag: + script_str = script_tag.string + + pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});' + p = re.compile(pattern, re.DOTALL) + + m = p.search(script_str) + if m: + jobs = json.loads(m.group(1).strip()) + return jobs + else: + return {"message": f"Could not find mosaic provider job cards data"} + else: + return { + "message": f"Could not find a script tag containing mosaic provider data" + } + + @staticmethod + def total_jobs(soup): + script = soup.find("script", string=lambda t: "window._initialData" in t) + + pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL) + match = pattern.search(script.string) + total_num_jobs = 0 + if match: + json_str = match.group(1) + data = json.loads(json_str) + total_num_jobs = data["searchTitleBarModel"]["totalNumResults"] + return total_num_jobs + + @staticmethod + def find_mosaic_script(soup): + script_tags = soup.find_all("script") + for script_tag in script_tags: + if ( + script_tag.string + and "mosaic.providerData" in script_tag.string + and "mosaic-provider-jobcards" in script_tag.string + ): + return script_tag + return None diff --git a/api/v1/__pycache__/__init__.cpython-310.pyc b/api/v1/__pycache__/__init__.cpython-310.pyc index a06585af0aac049d7c14634f987ca53f567ead43..4edaf46235351a8d7dd2496c3afea807ff8f3b38 100644 GIT binary patch delta 80 zcmdnUw26r~pO=@50SKN3El-&+k@uvTgR@mkXmM&$aZEw7v5{#^KxJ}9Vo`2DQGQlx ia!GMaR(?`(L1j!*VsdtBUP?@2L1s*uVa#N1Ml}F0ksG!E delta 79 zcmdnQw2_H7pO=@50SIpTEl=s5$a~V%UO%)rwWwIXDls`Dv8Xa5F}I*3KTqE!wK%&Z hzd%1LKdHE&Qa>p%IXg8kML)41Q@_kme=-lF8UP(08uI`E diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index 5932ff1..9a7dba9 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -1,3 +1,16 @@ from fastapi import APIRouter +from api.core.scrapers.indeed import IndeedScraper +from api.core.scrapers import ScraperInput +from api.core.jobs import JobResponse + router = APIRouter(prefix="/jobs") + + +@router.get("/") +async def scrape_jobs(search_term: str, location: str, page: int = None): + scraper = IndeedScraper() + + scraper_input = ScraperInput(search_term=search_term, location=location, page=page) + job_response = scraper.scrape(scraper_input) + return job_response diff --git a/api/v1/jobs/__pycache__/__init__.cpython-310.pyc b/api/v1/jobs/__pycache__/__init__.cpython-310.pyc index 11962eff5a70fab2a7abba510814369892006332..d15969a4ccbbd09d5f7f026f28aba463a291c54e 100644 GIT binary patch literal 711 zcmZWnJ&)5s5Z&?mD~TOMM@2>BCMQ5N2q6%n2t^d#iDqTVINr^j4}WQQg9vgY(9xm$ z4|g5>j{6sGspzS$VsPri|K{R&MPv&&F=$*8vNF2;d&~&w+&^ycqG|68Myl#F&rKCt|YpdBo$d z-Wjlnhess2^9#F>1cuSe<@~VSXd&?s&l@fTKPqJ2St8np^QPNqOqjfBSBFA%ZKK2| z@8GmuDP;Xligo$%8V(l2M2TD$Cm9y3hp=jkT$gP#xLtlM3Tm3}&Bw2Yeub_cAbbH$ z&d82x=t&Q~!{MCn+@4-Iy?Y5eulI0nXOU28RwYLU%u10kb)}2Id}U`({C|>IaC0G9 zybrR{Xkrm+$mCwq4^oCu3@*qRxdawTJOt}pX)GfhBjO%L zv!az^wx3naO8YQYD7*XL^^NxUFK-%3v)sX#_H74xEc8H?mIEytZ97}3w;(e4;j_Bs Xn@T*#UzDgRLPR1bf(XV=NbkcRVyw19 literal 239 zcmYjJI}XAy6ieDtKB63e8+0LJM+i_CmI{GbSgJr$we*uH35aqB&Ve`*U75H76OT?< zdbaFmS$JL#0iNNVEkIvC^A8)$3{DS$7-Bd_1>RwdIA#{JZ+K@hXM_CK8~(`0b$7m6 zuEoi4b#0NO*lWmKsd$`TLf@|2G1Z33R9R53UDY%nNV#Ae`UGUFxzglH4+&LOLJMg` xNdm4jBV;ovE3&7DjF*hS6giKZltfWlrY4F;vZ_0iLakAkCK05#a diff --git a/main.py b/main.py index 13b96a6..c870be2 100644 --- a/main.py +++ b/main.py @@ -8,4 +8,3 @@ app.include_router(api_router) @app.get("/") async def root(): return {"message": "Hello World"} -