From ab7fa24e7be5ba8a9e34b3718a6a50d1692be4c7 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 10 Jul 2023 16:14:05 -0500 Subject: [PATCH 1/4] feat(jobs): add distance param --- api/core/scrapers/__init__.py | 1 + api/core/scrapers/indeed/__init__.py | 1 + api/core/scrapers/linkedin/__init__.py | 6 +++++- api/core/scrapers/ziprecruiter/__init__.py | 1 + api/v1/jobs/__init__.py | 8 ++++++-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index d128403..d4e9546 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -12,6 +12,7 @@ class Site(Enum): class ScraperInput(BaseModel): location: str search_term: str + distance: int = 25 page: int = 1 diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index f89d8a0..b7a6605 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -26,6 +26,7 @@ class IndeedScraper(Scraper): "l": scraper_input.location, "filter": 0, "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "radius": scraper_input.distance, } response = session.get(self.url, params=params) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 198b0dc..7ce4f93 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,7 +16,11 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: - params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location} + params = { + "pageNum": scraper_input.page - 1, + "location": scraper_input.location, + "distance": scraper_input.distance, + } self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = requests.get(self.url, params=params) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 89fd0ec..3a35747 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -25,6 +25,7 @@ class ZipRecruiterScraper(Scraper): "search": scraper_input.search_term, "location": scraper_input.location, "page": min(scraper_input.page, 10), + "radius": scraper_input.distance, } response = session.get( diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py index e79c4bf..dba24a0 100644 --- a/api/v1/jobs/__init__.py +++ b/api/v1/jobs/__init__.py @@ -15,11 +15,15 @@ SCRAPER_MAPPING = { @router.get("/") -async def scrape_jobs(site_type: Site, search_term: str, location: str, page: int = 1): +async def scrape_jobs( + site_type: Site, search_term: str, location: str, page: int = 1, distance: int = 25 +): scraper_class = SCRAPER_MAPPING[site_type] scraper = scraper_class() - scraper_input = ScraperInput(search_term=search_term, location=location, page=page) + scraper_input = ScraperInput( + search_term=search_term, location=location, page=page, distance=distance + ) job_response = scraper.scrape(scraper_input) return job_response From 95be7766466a5f1ab61cf5ee3375105ff2692e1a Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:43:45 -0500 Subject: [PATCH 2/4] - indeed refactor - wanted_results init --- api/core/jobs/__init__.py | 12 +++++--- api/core/scrapers/__init__.py | 8 ++---- api/core/scrapers/indeed/__init__.py | 41 ++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index 2026422..ae6feca 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -58,8 +58,12 @@ class JobPost(BaseModel): class JobResponse(BaseModel): - job_count: int - page: int = 1 - total_pages: int + success: bool + error: str = None + + total_pages: int = None + job_count: int = None + + page: int = None + jobs: list[JobPost] = [] - jobs: list[JobPost] diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index d4e9546..b8d8bfd 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from pydantic import BaseModel from enum import Enum -from ..jobs import JobResponse +from ..jobs import JobResponse, JobPost class Site(Enum): @@ -13,13 +13,11 @@ class ScraperInput(BaseModel): location: str search_term: str distance: int = 25 - - page: int = 1 + results_wanted: int = 15 #: TODO: implement class Scraper: #: to be used as a child class def __init__(self, site: Site): self.site = site - def scrape(self, scraper_input: ScraperInput) -> JobResponse: - ... + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index b7a6605..d25f7b3 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -10,6 +10,10 @@ from api.core.jobs import * from api.core.scrapers import Scraper, ScraperInput, Site +class ParsingException(Exception): + pass + + class IndeedScraper(Scraper): def __init__(self): site = Site(Site.INDEED) @@ -25,7 +29,7 @@ class IndeedScraper(Scraper): "q": scraper_input.search_term, "l": scraper_input.location, "filter": 0, - "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "start": 0, "radius": scraper_input.distance, } @@ -38,12 +42,25 @@ class IndeedScraper(Scraper): soup = BeautifulSoup(response.content, "html.parser") - jobs = IndeedScraper.parse_jobs(soup) + try: + jobs = IndeedScraper.parse_jobs(soup) + except ParsingException: + return JobResponse( + success=False, + error="Failed to parse jobs.", + ) + total_num_jobs = IndeedScraper.total_jobs(soup) total_pages = ceil(total_num_jobs / 15) job_list: list[JobPost] = [] - # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] + if not jobs.get('metaData', {}).get("mosaicProviderJobCardsModel", {}).get("results"): + return JobResponse( + success=False, + error="No jobs found", + ) + + page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: snippet_html = BeautifulSoup(job["snippet"], "html.parser") @@ -94,9 +111,10 @@ class IndeedScraper(Scraper): job_list.append(job_post) job_response = JobResponse( + success=True, jobs=job_list, job_count=total_num_jobs, - page=scraper_input.page, + page=page_number, total_pages=total_pages, ) return job_response @@ -116,7 +134,14 @@ class IndeedScraper(Scraper): return None @staticmethod - def parse_jobs(soup): + def parse_jobs(soup: BeautifulSoup) -> dict: + """ + Parses the jobs from the soup object + + :param soup: + :return: jobs + """ + script_tag = IndeedScraper.find_mosaic_script(soup) if script_tag: @@ -130,11 +155,9 @@ class IndeedScraper(Scraper): jobs = json.loads(m.group(1).strip()) return jobs else: - return {"message": f"Could not find mosaic provider job cards data"} + raise ParsingException("Could not find mosaic provider job cards data") else: - return { - "message": f"Could not find a script tag containing mosaic provider data" - } + raise ParsingException("Could not find a script tag containing mosaic provider data") @staticmethod def total_jobs(soup): From fc4c8213998e5e80e7f263368e836619da929553 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:47:22 -0500 Subject: [PATCH 3/4] - linkedin refactor --- api/core/scrapers/linkedin/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 7ce4f93..a28c9c3 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,8 +16,10 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: + current_page = 0 + params = { - "pageNum": scraper_input.page - 1, + "pageNum": current_page, "location": scraper_input.location, "distance": scraper_input.distance, } @@ -58,6 +60,8 @@ class LinkedInScraper(Scraper): if datetime_tag: datetime_str = datetime_tag["datetime"] date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") + else: + date_posted = None job_post = JobPost( title=title, @@ -74,9 +78,11 @@ class LinkedInScraper(Scraper): job_count = int("".join(filter(str.isdigit, job_count_text))) total_pages = ceil(job_count / 25) job_response = JobResponse( + success=True, + jobs=job_list, job_count=job_count, - page=scraper_input.page, + page=current_page + 1, total_pages=total_pages, ) return job_response From 215d1ef1fd0be474afd81f52229a1715ea71b333 Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Mon, 10 Jul 2023 17:51:55 -0500 Subject: [PATCH 4/4] - ziprecruiter refactor --- api/core/scrapers/ziprecruiter/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 3a35747..0cee62d 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -21,10 +21,12 @@ class ZipRecruiterScraper(Scraper): client_identifier="chrome112", random_tls_extension_order=True ) + current_page = 1 + params = { "search": scraper_input.search_term, "location": scraper_input.location, - "page": min(scraper_input.page, 10), + "page": min(current_page, 10), "radius": scraper_input.distance, } @@ -80,6 +82,7 @@ class ZipRecruiterScraper(Scraper): job_count = job_count.replace(",", "") total_pages = data["maxPages"] job_response = JobResponse( + success=True, jobs=job_list, job_count=job_count, page=params["page"], @@ -87,6 +90,7 @@ class ZipRecruiterScraper(Scraper): ) return job_response + @staticmethod def get_interval(interval_str): interval_alias = {"annually": CompensationInterval.YEARLY} interval_str = interval_str.lower() @@ -97,7 +101,7 @@ class ZipRecruiterScraper(Scraper): return CompensationInterval(interval_str) @staticmethod - def get_date_posted(job: str): + def get_date_posted(job: BeautifulSoup): button = job.find( "button", {"class": "action_input save_job zrs_btn_secondary_200"} ) @@ -107,7 +111,7 @@ class ZipRecruiterScraper(Scraper): return params.get("posted_time", [None])[0] @staticmethod - def get_compensation(job): + def get_compensation(job: BeautifulSoup): pay_element = job.find("li", {"class": "perk_item perk_pay"}) if pay_element is None: return None @@ -116,7 +120,7 @@ class ZipRecruiterScraper(Scraper): return ZipRecruiterScraper.create_compensation_object(pay) @staticmethod - def get_location(job): + def get_location(job: BeautifulSoup): location_string = job.find("a", {"class": "company_location"}).text.strip() parts = location_string.split(", ") city, state = parts