diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index 2026422..ae6feca 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -58,8 +58,12 @@ class JobPost(BaseModel): class JobResponse(BaseModel): - job_count: int - page: int = 1 - total_pages: int + success: bool + error: str = None + + total_pages: int = None + job_count: int = None + + page: int = None + jobs: list[JobPost] = [] - jobs: list[JobPost] diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index d4e9546..b8d8bfd 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from pydantic import BaseModel from enum import Enum -from ..jobs import JobResponse +from ..jobs import JobResponse, JobPost class Site(Enum): @@ -13,13 +13,11 @@ class ScraperInput(BaseModel): location: str search_term: str distance: int = 25 - - page: int = 1 + results_wanted: int = 15 #: TODO: implement class Scraper: #: to be used as a child class def __init__(self, site: Site): self.site = site - def scrape(self, scraper_input: ScraperInput) -> JobResponse: - ... + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index b7a6605..d25f7b3 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -10,6 +10,10 @@ from api.core.jobs import * from api.core.scrapers import Scraper, ScraperInput, Site +class ParsingException(Exception): + pass + + class IndeedScraper(Scraper): def __init__(self): site = Site(Site.INDEED) @@ -25,7 +29,7 @@ class IndeedScraper(Scraper): "q": scraper_input.search_term, "l": scraper_input.location, "filter": 0, - "start": 0 if scraper_input.page is None else (scraper_input.page - 1) * 10, + "start": 0, "radius": scraper_input.distance, } @@ -38,12 +42,25 @@ class IndeedScraper(Scraper): soup = BeautifulSoup(response.content, "html.parser") - jobs = IndeedScraper.parse_jobs(soup) + try: + jobs = IndeedScraper.parse_jobs(soup) + except ParsingException: + return JobResponse( + success=False, + error="Failed to parse jobs.", + ) + total_num_jobs = IndeedScraper.total_jobs(soup) total_pages = ceil(total_num_jobs / 15) job_list: list[JobPost] = [] - # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] + if not jobs.get('metaData', {}).get("mosaicProviderJobCardsModel", {}).get("results"): + return JobResponse( + success=False, + error="No jobs found", + ) + + page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"] for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: snippet_html = BeautifulSoup(job["snippet"], "html.parser") @@ -94,9 +111,10 @@ class IndeedScraper(Scraper): job_list.append(job_post) job_response = JobResponse( + success=True, jobs=job_list, job_count=total_num_jobs, - page=scraper_input.page, + page=page_number, total_pages=total_pages, ) return job_response @@ -116,7 +134,14 @@ class IndeedScraper(Scraper): return None @staticmethod - def parse_jobs(soup): + def parse_jobs(soup: BeautifulSoup) -> dict: + """ + Parses the jobs from the soup object + + :param soup: + :return: jobs + """ + script_tag = IndeedScraper.find_mosaic_script(soup) if script_tag: @@ -130,11 +155,9 @@ class IndeedScraper(Scraper): jobs = json.loads(m.group(1).strip()) return jobs else: - return {"message": f"Could not find mosaic provider job cards data"} + raise ParsingException("Could not find mosaic provider job cards data") else: - return { - "message": f"Could not find a script tag containing mosaic provider data" - } + raise ParsingException("Could not find a script tag containing mosaic provider data") @staticmethod def total_jobs(soup): diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index 7ce4f93..a28c9c3 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -16,8 +16,10 @@ class LinkedInScraper(Scraper): self.url = "https://www.linkedin.com/jobs" def scrape(self, scraper_input: ScraperInput) -> JobResponse: + current_page = 0 + params = { - "pageNum": scraper_input.page - 1, + "pageNum": current_page, "location": scraper_input.location, "distance": scraper_input.distance, } @@ -58,6 +60,8 @@ class LinkedInScraper(Scraper): if datetime_tag: datetime_str = datetime_tag["datetime"] date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") + else: + date_posted = None job_post = JobPost( title=title, @@ -74,9 +78,11 @@ class LinkedInScraper(Scraper): job_count = int("".join(filter(str.isdigit, job_count_text))) total_pages = ceil(job_count / 25) job_response = JobResponse( + success=True, + jobs=job_list, job_count=job_count, - page=scraper_input.page, + page=current_page + 1, total_pages=total_pages, ) return job_response diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 3a35747..0cee62d 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -21,10 +21,12 @@ class ZipRecruiterScraper(Scraper): client_identifier="chrome112", random_tls_extension_order=True ) + current_page = 1 + params = { "search": scraper_input.search_term, "location": scraper_input.location, - "page": min(scraper_input.page, 10), + "page": min(current_page, 10), "radius": scraper_input.distance, } @@ -80,6 +82,7 @@ class ZipRecruiterScraper(Scraper): job_count = job_count.replace(",", "") total_pages = data["maxPages"] job_response = JobResponse( + success=True, jobs=job_list, job_count=job_count, page=params["page"], @@ -87,6 +90,7 @@ class ZipRecruiterScraper(Scraper): ) return job_response + @staticmethod def get_interval(interval_str): interval_alias = {"annually": CompensationInterval.YEARLY} interval_str = interval_str.lower() @@ -97,7 +101,7 @@ class ZipRecruiterScraper(Scraper): return CompensationInterval(interval_str) @staticmethod - def get_date_posted(job: str): + def get_date_posted(job: BeautifulSoup): button = job.find( "button", {"class": "action_input save_job zrs_btn_secondary_200"} ) @@ -107,7 +111,7 @@ class ZipRecruiterScraper(Scraper): return params.get("posted_time", [None])[0] @staticmethod - def get_compensation(job): + def get_compensation(job: BeautifulSoup): pay_element = job.find("li", {"class": "perk_item perk_pay"}) if pay_element is None: return None @@ -116,7 +120,7 @@ class ZipRecruiterScraper(Scraper): return ZipRecruiterScraper.create_compensation_object(pay) @staticmethod - def get_location(job): + def get_location(job: BeautifulSoup): location_string = job.find("a", {"class": "company_location"}).text.strip() parts = location_string.split(", ") city, state = parts