diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py
index 94d3dd4..6de3894 100644
--- a/api/core/scrapers/__init__.py
+++ b/api/core/scrapers/__init__.py
@@ -1,6 +1,11 @@
 from ..jobs import *
 
 
+class StatusException(Exception):
+    def __init__(self, status_code: int):
+        self.status_code = status_code
+
+
 class Site(Enum):
     LINKEDIN = "linkedin"
     INDEED = "indeed"
diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 3aa6a88..41c8970 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -8,7 +8,9 @@ from bs4.element import Tag
 from fastapi import status
 
 from api.core.jobs import *
-from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
+
+from concurrent.futures import ThreadPoolExecutor, Future
 
 
 class ParsingException(Exception):
@@ -25,6 +27,99 @@ class IndeedScraper(Scraper):
         self.url = "https://www.indeed.com/jobs"
         self.job_url = "https://www.indeed.com/viewjob?jk="
 
+        self.jobs_per_page = 15
+        self.seen_urls = set()
+
+    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+        """
+        Scrapes one page of Indeed results; raises StatusException on a bad
+        response so the caller can turn it into a failed JobResponse
+        """
+        job_list = []
+
+        params = {
+            "q": scraper_input.search_term,
+            "location": scraper_input.location,
+            "radius": scraper_input.distance,
+            "filter": 0,
+            "start": 0 + page * 10,
+        }
+        sc_values = []
+        if scraper_input.is_remote:
+            sc_values.append("attr(DSQF7)")
+        if scraper_input.job_type:
+            sc_values.append("jt({})".format(scraper_input.job_type.value))
+
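+        #: Indeed takes its filters packed into a single "sc" query param,
+        #: e.g. "0kf:attr(DSQF7)jt(fulltime);" (the jt value shown here is
+        #: illustrative; it comes from scraper_input.job_type.value)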
+        if sc_values:
+            params["sc"] = "0kf:" + "".join(sc_values) + ";"
+        response = session.get(self.url, params=params)
+
+        if (
+            response.status_code != status.HTTP_200_OK
+            and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
+        ):
+            raise StatusException(response.status_code)
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        jobs = IndeedScraper.parse_jobs(soup)  #: can raise exception, handled by main scrape function
+
+        #: total_num_jobs = IndeedScraper.total_jobs(soup)  #: for now
+
+        if (
+            not jobs.get("metaData", {})
+            .get("mosaicProviderJobCardsModel", {})
+            .get("results")
+        ):
+            raise Exception("No jobs found.")
+
+        for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
+            job_url = f'{self.job_url}{job["jobkey"]}'
+            if job_url in self.seen_urls:
+                continue
+            self.seen_urls.add(job_url)
+
+            snippet_html = BeautifulSoup(job["snippet"], "html.parser")
+
+            extracted_salary = job.get("extractedSalary")
+            compensation = None
+            if extracted_salary:
+                salary_snippet = job.get("salarySnippet")
+                currency = (
+                    salary_snippet.get("currency") if salary_snippet else None
+                )
+                interval = extracted_salary.get("type")
+                if interval:
+                    interval = interval.upper()
+
+                if interval in CompensationInterval.__members__:
+                    compensation = Compensation(
+                        interval=CompensationInterval[interval],
+                        min_amount=extracted_salary.get("min"),
+                        max_amount=extracted_salary.get("max"),
+                        currency=currency,
+                    )
+
+            job_type = IndeedScraper.get_job_type(job)
+            timestamp_seconds = job["pubDate"] / 1000
+            date_posted = datetime.fromtimestamp(timestamp_seconds)
+
+            first_li = snippet_html.find("li")
+            job_post = JobPost(
+                title=job["normTitle"],
+                description=first_li.text if first_li else None,
+                company_name=job["company"],
+                location=Location(
+                    city=job.get("jobLocationCity"),
+                    state=job.get("jobLocationState"),
+                    postal_code=job.get("jobLocationPostal"),
+                    country="US",
+                ),
+                job_type=job_type,
+                compensation=compensation,
+                date_posted=date_posted,
+                job_url=job_url,
+            )
+            job_list.append(job_post)
+
+        return job_list
+
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Indeed for jobs with scraper_input criteria
@@ -35,125 +130,45 @@
             client_identifier="chrome112", random_tls_extension_order=True
         )
 
-        job_list: list[JobPost] = []
-        page = 0
-        processed_jobs, total_num_jobs = 0, 0
-        seen_urls = set()
-        while len(job_list) < scraper_input.results_wanted:
-            params = {
-                "q": scraper_input.search_term,
-                "location": scraper_input.location,
-                "radius": scraper_input.distance,
-                "filter": 0,
-                "start": 0 + page * 10,
-            }
-            sc_values = []
-            if scraper_input.is_remote:
-                sc_values.append("attr(DSQF7)")
-            if scraper_input.job_type:
-                sc_values.append("jt({})".format(scraper_input.job_type.value))
+        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
 
-            if sc_values:
-                params["sc"] = "0kf:" + "".join(sc_values) + ";"
-            response = session.get(self.url, params=params)
+        try:
+            #: get first page to initialize session
+            job_list = self.scrape_page(scraper_input, 0, session)
 
-            if (
-                response.status_code != status.HTTP_200_OK
-                and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
-            ):
-                return JobResponse(
-                    success=False,
-                    error=f"Response returned {response.status_code}",
-                )
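+            #: remaining pages are fetched concurrently; futures resolve in
+            #: submission order, and seen_urls (checked in scrape_page)
+            #: drops postings that Indeed repeats across pages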
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(
+                        self.scrape_page, scraper_input, page, session
+                    ) for page in range(1, pages_to_process + 1)
+                ]
 
-            soup = BeautifulSoup(response.content, "html.parser")
+                for future in futures:
+                    result = future.result()
 
-            try:
-                jobs = IndeedScraper.parse_jobs(soup)
-            except ParsingException:
-                return JobResponse(
-                    success=False,
-                    error="Failed to parse jobs.",
-                )
+                    job_list += result
 
-            total_num_jobs = IndeedScraper.total_jobs(soup)
+        except StatusException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed returned status code {e.status_code}",
+            )
+        except ParsingException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to parse response: {e}",
+            )
+        except Exception as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to scrape: {e}",
+            )
 
-            if (
-                not jobs.get("metaData", {})
-                .get("mosaicProviderJobCardsModel", {})
-                .get("results")
-            ):
-                return JobResponse(
-                    success=False,
-                    error="No jobs found",
-                )
-
-            for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-                processed_jobs += 1
-                job_url = f'{self.job_url}{job["jobkey"]}'
-                if job_url in seen_urls:
-                    continue
-                snippet_html = BeautifulSoup(job["snippet"], "html.parser")
-
-                extracted_salary = job.get("extractedSalary")
-                compensation = None
-                if extracted_salary:
-                    salary_snippet = job.get("salarySnippet")
-                    currency = (
-                        salary_snippet.get("currency") if salary_snippet else None
-                    )
-                    interval = (extracted_salary.get("type"),)
-                    if isinstance(interval, tuple):
-                        interval = interval[0]
-
-                    interval = interval.upper()
-                    if interval in CompensationInterval.__members__:
-                        compensation = Compensation(
-                            interval=CompensationInterval[interval],
-                            min_amount=extracted_salary.get("max"),
-                            max_amount=extracted_salary.get("min"),
-                            currency=currency,
-                        )
-
-                job_type = IndeedScraper.get_job_type(job)
-                timestamp_seconds = job["pubDate"] / 1000
-                date_posted = datetime.fromtimestamp(timestamp_seconds)
-
-                first_li = snippet_html.find("li")
-                job_post = JobPost(
-                    title=job["normTitle"],
-                    description=first_li.text if first_li else None,
-                    company_name=job["company"],
-                    location=Location(
-                        city=job.get("jobLocationCity"),
-                        state=job.get("jobLocationState"),
-                        postal_code=job.get("jobLocationPostal"),
-                        country="US",
-                    ),
-                    job_type=job_type,
-                    compensation=compensation,
-                    date_posted=date_posted,
-                    job_url=job_url,
-                )
-                job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= total_num_jobs
-                ):
-                    break
-
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= total_num_jobs
-            ):
-                break
-            page += 1
-
-        job_list = job_list[: scraper_input.results_wanted]
+        #: job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=total_num_jobs,
+            job_count=len(job_list),
         )
         return job_response
@@ -192,9 +207,9 @@ class IndeedScraper(Scraper):
         script_tags = soup.find_all("script")
         for tag in script_tags:
             if (
-                    tag.string
-                    and "mosaic.providerData" in tag.string
-                    and "mosaic-provider-jobcards" in tag.string
+                tag.string
+                and "mosaic.providerData" in tag.string
+                and "mosaic-provider-jobcards" in tag.string
             ):
                 return tag
         return None
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 21abd6e..364090f 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -7,15 +7,10 @@ from fastapi import status
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, Future
 
-from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
 
 
-class StatusException(Exception):
-    def __init__(self, status_code: int):
-        self.status_code = status_code
-
-
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
         """