From a0425ef480f4580754fb98471cd495ebf0c50ed9 Mon Sep 17 00:00:00 2001
From: zacharyhampton
Date: Tue, 11 Jul 2023 10:49:36 -0500
Subject: [PATCH 1/5] - zip_recruiter parallel job search

---
 api/core/scrapers/ziprecruiter/__init__.py | 214 ++++++++++++---------
 1 file changed, 119 insertions(+), 95 deletions(-)

diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index b340c26..21abd6e 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -5,11 +5,17 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from fastapi import status
 from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor, Future
 
 from api.core.scrapers import Scraper, ScraperInput, Site
 from api.core.jobs import *
 
 
+class StatusException(Exception):
+    def __init__(self, status_code: int):
+        self.status_code = status_code
+
+
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
         """
@@ -19,6 +25,97 @@ class ZipRecruiterScraper(Scraper):
         super().__init__(site)
 
         self.url = "https://www.ziprecruiter.com/jobs-search"
+        self.jobs_per_page = 20
+        self.seen_urls = set()
+
+    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+        """
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
+        :param scraper_input:
+        :param page:
+        :param session:
+        :return:
+        """
+
+        job_list = []
+
+        job_type_value = None
+        if scraper_input.job_type:
+            if scraper_input.job_type.value == "fulltime":
+                job_type_value = "full_time"
+            elif scraper_input.job_type.value == "parttime":
+                job_type_value = "part_time"
+            else:
+                job_type_value = scraper_input.job_type.value
+
+        params = {
+            "search": scraper_input.search_term,
+            "location": scraper_input.location,
+            "radius": scraper_input.distance,
+            "refine_by_location_type": "only_remote"
+            if scraper_input.is_remote
+            else None,
+            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
+            if job_type_value
+            else None,
+            "page": page,
+        }
+
+        response = session.get(
+            self.url, headers=ZipRecruiterScraper.headers(), params=params
+        )
+
+        if response.status_code != status.HTTP_200_OK:
+            raise StatusException(response.status_code)
+
+        html_string = response.content
+        soup = BeautifulSoup(html_string, "html.parser")
+        if page == 1:
+            script_tag = soup.find("script", {"id": "js_variables"})
+            data = json.loads(script_tag.string)
+
+            #: job_count = int(data["totalJobCount"].replace(",", ""))
+
+        job_posts = soup.find_all("div", {"class": "job_content"})
+
+        for job in job_posts:
+            job_url = job.find("a", {"class": "job_link"})["href"]
+            if job_url in self.seen_urls:
+                continue
+
+            title = job.find("h2", {"class": "title"}).text
+            company = job.find("a", {"class": "company_name"}).text.strip()
+            description = job.find("p", {"class": "job_snippet"}).text.strip()
+            job_type_element = job.find("li", {"class": "perk_item perk_type"})
+
+            if job_type_element:
+                job_type_text = (
+                    job_type_element.text.strip()
+                    .lower()
+                    .replace("-", "")
+                    .replace(" ", "")
+                )
+                if job_type_text == "contractor":
+                    job_type_text = "contract"
+                job_type = JobType(job_type_text)
+            else:
+                job_type = None
+
+            date_posted = ZipRecruiterScraper.get_date_posted(job)
+
+            job_post = JobPost(
+                title=title,
+                description=description,
+                company_name=company,
+                location=ZipRecruiterScraper.get_location(job),
+                job_type=job_type,
+                compensation=ZipRecruiterScraper.get_compensation(job),
+                date_posted=date_posted,
+                job_url=job_url,
+            )
+            job_list.append(job_post)
+
+        return job_list
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -29,110 +126,37 @@ class ZipRecruiterScraper(Scraper):
         session = tls_client.Session(
             client_identifier="chrome112", random_tls_extension_order=True
         )
+        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
 
-        job_list: list[JobPost] = []
-        page = 1
-        processed_jobs, job_count = 0, 0
-        seen_urls = set()
-        while len(job_list) < scraper_input.results_wanted:
-            job_type_value = None
-            if scraper_input.job_type:
-                if scraper_input.job_type.value == "fulltime":
-                    job_type_value = "full_time"
-                elif scraper_input.job_type.value == "parttime":
-                    job_type_value = "part_time"
-                else:
-                    job_type_value = scraper_input.job_type.value
+        try:
+            #: get first page to initialize session
+            job_list = self.scrape_page(scraper_input, 1, session)
 
-            params = {
-                "search": scraper_input.search_term,
-                "location": scraper_input.location,
-                "radius": scraper_input.distance,
-                "refine_by_location_type": "only_remote"
-                if scraper_input.is_remote
-                else None,
-                "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-                if job_type_value
-                else None,
-                "page": page,
-            }
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(
+                        self.scrape_page, scraper_input, page, session
+                    ) for page in range(2, pages_to_process + 1)
+                ]
 
-            response = session.get(
-                self.url, headers=ZipRecruiterScraper.headers(), params=params
+                for future in futures:
+                    result = future.result()
+
+                    job_list += result
+
+        except StatusException as e:
+            return JobResponse(
+                success=False,
+                error=f"ZipRecruiter returned status code {e.status_code}",
             )
-            print(response.url)
-            if response.status_code != status.HTTP_200_OK:
-                return JobResponse(
-                    success=False,
-                    error=f"Response returned {response.status_code}",
-                )
 
-            html_string = response.content
-            soup = BeautifulSoup(html_string, "html.parser")
-            if page == 1:
-                script_tag = soup.find("script", {"id": "js_variables"})
-                data = json.loads(script_tag.string)
+        #: note: this does not handle if the results are more or less than the results_wanted
 
-                job_count = data["totalJobCount"]
-                job_count = int(job_count.replace(",", ""))
-
-            job_posts = soup.find_all("div", {"class": "job_content"})
-
-            for job in job_posts:
-                processed_jobs += 1
-                job_url = job.find("a", {"class": "job_link"})["href"]
-                if job_url in seen_urls:
-                    continue
-                title = job.find("h2", {"class": "title"}).text
-                company = job.find("a", {"class": "company_name"}).text.strip()
-                description = job.find("p", {"class": "job_snippet"}).text.strip()
-                job_type_element = job.find("li", {"class": "perk_item perk_type"})
-
-                if job_type_element:
-                    job_type_text = (
-                        job_type_element.text.strip()
-                        .lower()
-                        .replace("-", "")
-                        .replace(" ", "")
-                    )
-                    if job_type_text == "contractor":
-                        job_type_text = "contract"
-                    job_type = JobType(job_type_text)
-                else:
-                    job_type = None
-
-                date_posted = ZipRecruiterScraper.get_date_posted(job)
-
-                job_post = JobPost(
-                    title=title,
-                    description=description,
-                    company_name=company,
-                    location=ZipRecruiterScraper.get_location(job),
-                    job_type=job_type,
-                    compensation=ZipRecruiterScraper.get_compensation(job),
-                    date_posted=date_posted,
-                    job_url=job_url,
-                )
-                job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= job_count
-                ):
-                    break
-
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= job_count
-            ):
-                break
-
-            page += 1
-
-        job_list = job_list[: scraper_input.results_wanted]
+        #: job_list = job_list[:scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=job_count,
+            job_count=len(job_list),
         )
         return job_response
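Stripped of the scraper details, the pattern PATCH 1 introduces is: fetch page 1 serially to prime the session, fan the remaining pages out to a thread pool, and let a status-carrying exception surface through future.result(). A runnable sketch of that shape under stated assumptions (fetch_page and its fake payload are placeholders, not code from this repo):

from concurrent.futures import Future, ThreadPoolExecutor


class StatusException(Exception):
    """Carries a non-2xx HTTP status code up to the caller."""

    def __init__(self, status_code: int):
        self.status_code = status_code


def fetch_page(page: int) -> list[str]:
    #: placeholder for session.get(...); a real worker would raise
    #: StatusException(response.status_code) on a bad response
    return [f"job-{page}-{i}" for i in range(3)]


def fetch_all(pages: int) -> list[str]:
    results = fetch_page(1)  #: first page alone, e.g. to prime cookies
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures: list[Future] = [
            executor.submit(fetch_page, page) for page in range(2, pages + 1)
        ]
        for future in futures:
            #: result() re-raises a worker's StatusException in this thread,
            #: which is what lets scrape() catch everything in one try/except
            results += future.result()
    return results


if __name__ == "__main__":
    print(len(fetch_all(5)))  #: 15 placeholder jobs

PATCH 2 below applies the same structure to Indeed and hoists StatusException into the shared scrapers package.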
From 59f07808316de3693710c5bf2034038fca127f73 Mon Sep 17 00:00:00 2001
From: zacharyhampton
Date: Tue, 11 Jul 2023 11:02:46 -0500
Subject: [PATCH 2/5] - indeed parallel job search

---
 api/core/scrapers/__init__.py              |   5 +
 api/core/scrapers/indeed/__init__.py       | 243 +++++++++++++----------
 api/core/scrapers/ziprecruiter/__init__.py |   7 +-
 3 files changed, 135 insertions(+), 120 deletions(-)

diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py
index 94d3dd4..6de3894 100644
--- a/api/core/scrapers/__init__.py
+++ b/api/core/scrapers/__init__.py
@@ -1,6 +1,11 @@
 from ..jobs import *
 
 
+class StatusException(Exception):
+    def __init__(self, status_code: int):
+        self.status_code = status_code
+
+
 class Site(Enum):
     LINKEDIN = "linkedin"
     INDEED = "indeed"
diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 3aa6a88..41c8970 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -8,7 +8,9 @@ from bs4.element import Tag
 from fastapi import status
 
 from api.core.jobs import *
-from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
+
+from concurrent.futures import ThreadPoolExecutor, Future
 
 
 class ParsingException(Exception):
@@ -25,6 +27,99 @@ class IndeedScraper(Scraper):
         self.url = "https://www.indeed.com/jobs"
         self.job_url = "https://www.indeed.com/viewjob?jk="
 
+        self.jobs_per_page = 15
+        self.seen_urls = set()
+
+    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+        job_list = []
+
+        params = {
+            "q": scraper_input.search_term,
+            "location": scraper_input.location,
+            "radius": scraper_input.distance,
+            "filter": 0,
+            "start": 0 + page * 10,
+        }
+        sc_values = []
+        if scraper_input.is_remote:
+            sc_values.append("attr(DSQF7)")
+        if scraper_input.job_type:
+            sc_values.append("jt({})".format(scraper_input.job_type.value))
+
+        if sc_values:
+            params["sc"] = "0kf:" + "".join(sc_values) + ";"
+        response = session.get(self.url, params=params)
+
+        if (
+            response.status_code != status.HTTP_200_OK
+            and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
+        ):
+            raise StatusException(response.status_code)
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        jobs = IndeedScraper.parse_jobs(soup)  #: can raise exception, handled by main scrape function
+
+        #: total_num_jobs = IndeedScraper.total_jobs(soup)  #: for now
+
+        if (
+            not jobs.get("metaData", {})
+            .get("mosaicProviderJobCardsModel", {})
+            .get("results")
+        ):
+            raise Exception('No jobs found.')
+
+        for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
+            job_url = f'{self.job_url}{job["jobkey"]}'
+            if job_url in self.seen_urls:
+                continue
+
+            snippet_html = BeautifulSoup(job["snippet"], "html.parser")
+
+            extracted_salary = job.get("extractedSalary")
+            compensation = None
+            if extracted_salary:
+                salary_snippet = job.get("salarySnippet")
+                currency = (
+                    salary_snippet.get("currency") if salary_snippet else None
+                )
+                interval = (extracted_salary.get("type"),)
+                if isinstance(interval, tuple):
+                    interval = interval[0]
+
+                interval = interval.upper()
+                if interval in CompensationInterval.__members__:
+                    compensation = Compensation(
+                        interval=CompensationInterval[interval],
+                        min_amount=extracted_salary.get("min"),
+                        max_amount=extracted_salary.get("max"),
+                        currency=currency,
+                    )
+
+            job_type = IndeedScraper.get_job_type(job)
+            timestamp_seconds = job["pubDate"] / 1000
+            date_posted = datetime.fromtimestamp(timestamp_seconds)
+
+            first_li = snippet_html.find("li")
+            job_post = JobPost(
+                title=job["normTitle"],
+                description=first_li.text if first_li else None,
+                company_name=job["company"],
+                location=Location(
+                    city=job.get("jobLocationCity"),
+                    state=job.get("jobLocationState"),
+                    postal_code=job.get("jobLocationPostal"),
+                    country="US",
+                ),
+                job_type=job_type,
+                compensation=compensation,
+                date_posted=date_posted,
+                job_url=job_url,
+            )
+            job_list.append(job_post)
+
+        return job_list
+
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Indeed for jobs with scraper_input criteria
@@ -35,125 +130,45 @@ class IndeedScraper(Scraper):
             client_identifier="chrome112", random_tls_extension_order=True
         )
 
-        job_list: list[JobPost] = []
-        page = 0
-        processed_jobs, total_num_jobs = 0, 0
-        seen_urls = set()
-        while len(job_list) < scraper_input.results_wanted:
-            params = {
-                "q": scraper_input.search_term,
-                "location": scraper_input.location,
-                "radius": scraper_input.distance,
-                "filter": 0,
-                "start": 0 + page * 10,
-            }
-            sc_values = []
-            if scraper_input.is_remote:
-                sc_values.append("attr(DSQF7)")
-            if scraper_input.job_type:
-                sc_values.append("jt({})".format(scraper_input.job_type.value))
+        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
 
-            if sc_values:
-                params["sc"] = "0kf:" + "".join(sc_values) + ";"
-            response = session.get(self.url, params=params)
+        try:
+            #: get first page to initialize session
+            job_list = self.scrape_page(scraper_input, 0, session)
 
-            if (
-                response.status_code != status.HTTP_200_OK
-                and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
-            ):
-                return JobResponse(
-                    success=False,
-                    error=f"Response returned {response.status_code}",
-                )
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(
+                        self.scrape_page, scraper_input, page, session
+                    ) for page in range(1, pages_to_process + 1)
+                ]
 
-            soup = BeautifulSoup(response.content, "html.parser")
+                for future in futures:
+                    result = future.result()
 
-            try:
-                jobs = IndeedScraper.parse_jobs(soup)
-            except ParsingException:
-                return JobResponse(
-                    success=False,
-                    error="Failed to parse jobs.",
-                )
+                    job_list += result
 
-            total_num_jobs = IndeedScraper.total_jobs(soup)
+        except StatusException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed returned status code {e.status_code}",
+            )
+        except ParsingException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to parse response: {e}",
+            )
+        except Exception as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to scrape: {e}",
+            )
 
-            if (
-                not jobs.get("metaData", {})
-                .get("mosaicProviderJobCardsModel", {})
-                .get("results")
-            ):
-                return JobResponse(
-                    success=False,
-                    error="No jobs found",
-                )
-
-            for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-                processed_jobs += 1
-                job_url = f'{self.job_url}{job["jobkey"]}'
-                if job_url in seen_urls:
-                    continue
-                snippet_html = BeautifulSoup(job["snippet"], "html.parser")
-
-                extracted_salary = job.get("extractedSalary")
-                compensation = None
-                if extracted_salary:
-                    salary_snippet = job.get("salarySnippet")
-                    currency = (
-                        salary_snippet.get("currency") if salary_snippet else None
-                    )
-                    interval = (extracted_salary.get("type"),)
-                    if isinstance(interval, tuple):
-                        interval = interval[0]
-
-                    interval = interval.upper()
-                    if interval in CompensationInterval.__members__:
-                        compensation = Compensation(
-                            interval=CompensationInterval[interval],
-                            min_amount=extracted_salary.get("max"),
-                            max_amount=extracted_salary.get("min"),
-                            currency=currency,
-                        )
-
-                job_type = IndeedScraper.get_job_type(job)
-                timestamp_seconds = job["pubDate"] / 1000
-                date_posted = datetime.fromtimestamp(timestamp_seconds)
-
-                first_li = snippet_html.find("li")
-                job_post = JobPost(
-                    title=job["normTitle"],
-                    description=first_li.text if first_li else None,
-                    company_name=job["company"],
-                    location=Location(
-                        city=job.get("jobLocationCity"),
-                        state=job.get("jobLocationState"),
-                        postal_code=job.get("jobLocationPostal"),
-                        country="US",
-                    ),
-                    job_type=job_type,
-                    compensation=compensation,
-                    date_posted=date_posted,
-                    job_url=job_url,
-                )
-                job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= total_num_jobs
-                ):
-                    break
-
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= total_num_jobs
-            ):
-                break
-            page += 1
-
-        job_list = job_list[: scraper_input.results_wanted]
+        #: job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=total_num_jobs,
+            job_count=len(job_list),
         )
         return job_response
@@ -192,9 +207,9 @@ class IndeedScraper(Scraper):
         script_tags = soup.find_all("script")
         for tag in script_tags:
             if (
-                tag.string
-                and "mosaic.providerData" in tag.string
-                and "mosaic-provider-jobcards" in tag.string
+                    tag.string
+                    and "mosaic.providerData" in tag.string
+                    and "mosaic-provider-jobcards" in tag.string
             ):
                 return tag
         return None
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 21abd6e..364090f 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -7,15 +7,10 @@ from fastapi import status
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, Future
 
-from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
 
 
-class StatusException(Exception):
-    def __init__(self, status_code: int):
-        self.status_code = status_code
-
-
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
         """
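The compensation branch PATCH 2 adds leans on enum name lookup: extractedSalary.type arrives lowercase from Indeed's payload and is matched against CompensationInterval.__members__ after upper(). The lookup in isolation (the two member names here are assumed for the sketch; the real enum lives in api/core/jobs):

from enum import Enum


class CompensationInterval(Enum):  #: trimmed stand-in for the real enum
    YEARLY = "yearly"
    HOURLY = "hourly"


def parse_interval(raw: str | None) -> CompensationInterval | None:
    if raw is None:
        return None
    name = raw.upper()
    #: __members__ is keyed by member *name*, so "yearly" must be
    #: upper-cased before the membership test
    if name in CompensationInterval.__members__:
        return CompensationInterval[name]
    return None


print(parse_interval("yearly"))    #: CompensationInterval.YEARLY
print(parse_interval("biweekly"))  #: None

PATCH 3 below builds on both scrapers to report the search's total result count alongside the returned jobs.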
From 16ddb9b4852735f74d2e274f97107d86610f0051 Mon Sep 17 00:00:00 2001
From: zacharyhampton
Date: Tue, 11 Jul 2023 12:00:24 -0500
Subject: [PATCH 3/5] - total vs returned schema definition - scraper bug fixes

---
 api/core/jobs/__init__.py                  | 14 ++++++--
 api/core/scrapers/indeed/__init__.py       | 39 ++++++++++++++++------
 api/core/scrapers/ziprecruiter/__init__.py | 35 +++++++++++++------
 3 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py
index 9b0a9b6..622f110 100644
--- a/api/core/jobs/__init__.py
+++ b/api/core/jobs/__init__.py
@@ -2,7 +2,7 @@ from typing import Union
 from datetime import datetime
 from enum import Enum
 
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 
 
 class JobType(Enum):
@@ -57,5 +57,15 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None
 
-    job_count: int = None
     jobs: list[JobPost] = []
+
+    total_results: int = None
+    returned_results: int = None
+
+    @validator("returned_results")
+    def set_returned_results(cls, v, values):
+        if v is None and values.get("jobs"):
+            return len(values["jobs"])
+        return v
+
+
diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 41c8970..9751bc7 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -1,6 +1,6 @@
 import re
 import json
-from typing import Optional
+from typing import Optional, Tuple, List
 
 import tls_client
 from bs4 import BeautifulSoup
@@ -8,9 +8,11 @@ from bs4.element import Tag
 from fastapi import status
 
 from api.core.jobs import *
+from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 
 from concurrent.futures import ThreadPoolExecutor, Future
+import math
 
 
 class ParsingException(Exception):
@@ -30,7 +32,21 @@ class IndeedScraper(Scraper):
         self.jobs_per_page = 15
         self.seen_urls = set()
 
-    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+    def scrape_page(
+            self,
+            scraper_input: ScraperInput,
+            page: int,
+            session: tls_client.Session
+    ) -> tuple[list[JobPost], int]:
+
+        """
+        Scrapes a page of Indeed for jobs with scraper_input criteria
+        :param scraper_input:
+        :param page:
+        :param session:
+        :return: jobs found on page, total number of jobs found for search
+        """
+
         job_list = []
 
         params = {
@@ -59,8 +75,7 @@ class IndeedScraper(Scraper):
         soup = BeautifulSoup(response.content, "html.parser")
 
         jobs = IndeedScraper.parse_jobs(soup)  #: can raise exception, handled by main scrape function
-
-        #: total_num_jobs = IndeedScraper.total_jobs(soup)  #: for now
+        total_num_jobs = IndeedScraper.total_jobs(soup)
 
         if (
             not jobs.get("metaData", {})
@@ -118,7 +133,7 @@ class IndeedScraper(Scraper):
             )
             job_list.append(job_post)
 
-        return job_list
+        return job_list, total_num_jobs
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -130,11 +145,11 @@ class IndeedScraper(Scraper):
             client_identifier="chrome112", random_tls_extension_order=True
         )
 
-        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
+        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
 
         try:
             #: get first page to initialize session
-            job_list = self.scrape_page(scraper_input, 0, session)
+            job_list, total_results = self.scrape_page(scraper_input, 0, session)
 
             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
@@ -144,9 +159,9 @@ class IndeedScraper(Scraper):
                 ]
 
                 for future in futures:
-                    result = future.result()
+                    jobs, _ = future.result()
 
-                    job_list += result
+                    job_list += jobs
 
         except StatusException as e:
             return JobResponse(
@@ -164,11 +179,13 @@ class IndeedScraper(Scraper):
                 error=f"Indeed failed to scrape: {e}",
             )
 
-        #: job_list = job_list[: scraper_input.results_wanted]
+        if len(job_list) > scraper_input.results_wanted:
+            job_list = job_list[:scraper_input.results_wanted]
+
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=len(job_list),
+            total_results=total_results,
         )
         return job_response
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 364090f..712cd96 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -1,5 +1,5 @@
 import json
-from typing import Optional
+from typing import Optional, Tuple, List
 from urllib.parse import urlparse, parse_qs
 
 import tls_client
@@ -7,8 +7,10 @@ from fastapi import status
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, Future
 
+from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
+import math
 
 
 class ZipRecruiterScraper(Scraper):
@@ -23,13 +25,19 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
 
-    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+    def scrape_page(
+            self,
+            scraper_input: ScraperInput,
+            page: int,
+            session: tls_client.Session
+    ) -> tuple[list[JobPost], int | None]:
+
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
         :param page:
         :param session:
-        :return:
+        :return: jobs found on page, total number of jobs found for search
         """
 
         job_list = []
@@ -69,7 +77,9 @@ class ZipRecruiterScraper(Scraper):
             script_tag = soup.find("script", {"id": "js_variables"})
             data = json.loads(script_tag.string)
 
-            #: job_count = int(data["totalJobCount"].replace(",", ""))
+            job_count = int(data["totalJobCount"].replace(",", ""))
+        else:
+            job_count = None
 
         job_posts = soup.find_all("div", {"class": "job_content"})
 
@@ -110,7 +120,7 @@ class ZipRecruiterScraper(Scraper):
             )
             job_list.append(job_post)
 
-        return job_list
+        return job_list, job_count
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -121,11 +131,12 @@ class ZipRecruiterScraper(Scraper):
         session = tls_client.Session(
             client_identifier="chrome112", random_tls_extension_order=True
         )
-        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
+
+        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
 
         try:
             #: get first page to initialize session
-            job_list = self.scrape_page(scraper_input, 1, session)
+            job_list, total_results = self.scrape_page(scraper_input, 1, session)
 
             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
@@ -135,9 +146,9 @@ class ZipRecruiterScraper(Scraper):
                 ]
 
                 for future in futures:
-                    result = future.result()
+                    jobs, _ = future.result()
 
-                    job_list += result
+                    job_list += jobs
 
         except StatusException as e:
             return JobResponse(
@@ -147,11 +158,13 @@ class ZipRecruiterScraper(Scraper):
 
         #: note: this does not handle if the results are more or less than the results_wanted
 
-        #: job_list = job_list[:scraper_input.results_wanted]
+        if len(job_list) > scraper_input.results_wanted:
+            job_list = job_list[:scraper_input.results_wanted]
+
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=len(job_list),
+            total_results=total_results,
         )
         return job_response

From 804646d91b6a8f4d608c2a555fe030ca96380c53 Mon Sep 17 00:00:00 2001
From: zacharyhampton
Date: Tue, 11 Jul 2023 12:02:50 -0500
Subject: [PATCH 4/5] - linkedin schema migration

---
 api/core/scrapers/linkedin/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py
index fd26bb5..eed6b5c 100644
--- a/api/core/scrapers/linkedin/__init__.py
+++ b/api/core/scrapers/linkedin/__init__.py
@@ -131,7 +131,7 @@ class LinkedInScraper(Scraper):
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=job_count,
+            total_results=job_count,
         )
         return job_response
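The quiet workhorse of PATCH 3's schema change is the returned_results validator: when the field is left unset, it falls back to len(jobs). A self-contained reduction of that behavior (pydantic v1 API, matching the @validator import in the diff; JobPost is stubbed with str):

from pydantic import BaseModel, validator


class JobResponse(BaseModel):
    success: bool
    error: str = None
    jobs: list[str] = []  #: stand-in for list[JobPost]
    total_results: int = None
    returned_results: int = None

    @validator("returned_results")
    def set_returned_results(cls, v, values):
        #: in pydantic v1, `values` only contains fields declared *above*
        #: the one being validated, so `jobs` must stay ahead of
        #: `returned_results` for this fallback to see it
        if v is None and values.get("jobs"):
            return len(values["jobs"])
        return v


print(JobResponse(success=True, jobs=["a", "b"]).returned_results)  #: 2

PATCH 5 below reorders the fields so the counts appear near the top of the serialized response.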
From 05b54190a0c30da4481e8815272ab8b3dd8c11f5 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Tue, 11 Jul 2023 12:24:04 -0500
Subject: [PATCH 5/5] refactor(JobResponse): place result count near top of response

---
 api/core/jobs/__init__.py                  |  6 +--
 api/core/scrapers/indeed/__init__.py       | 43 ++++++++++------------
 api/core/scrapers/ziprecruiter/__init__.py | 13 ++-----
 3 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py
index 622f110..cbf15e8 100644
--- a/api/core/jobs/__init__.py
+++ b/api/core/jobs/__init__.py
@@ -57,15 +57,13 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None
 
-    jobs: list[JobPost] = []
-
     total_results: int = None
     returned_results: int = None
 
+    jobs: list[JobPost] = []
+
     @validator("returned_results")
     def set_returned_results(cls, v, values):
         if v is None and values.get("jobs"):
             return len(values["jobs"])
         return v
-
-
diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 9751bc7..40d4601 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -33,12 +33,8 @@ class IndeedScraper(Scraper):
         self.seen_urls = set()
 
     def scrape_page(
-            self,
-            scraper_input: ScraperInput,
-            page: int,
-            session: tls_client.Session
+        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
     ) -> tuple[list[JobPost], int]:
-
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param scraper_input:
@@ -67,22 +63,24 @@ class IndeedScraper(Scraper):
         response = session.get(self.url, params=params)
 
         if (
-                response.status_code != status.HTTP_200_OK
-                and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
+            response.status_code != status.HTTP_200_OK
+            and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
         ):
             raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
 
-        jobs = IndeedScraper.parse_jobs(soup)  #: can raise exception, handled by main scrape function
+        jobs = IndeedScraper.parse_jobs(
+            soup
+        )  #: can raise exception, handled by main scrape function
         total_num_jobs = IndeedScraper.total_jobs(soup)
 
         if (
-                not jobs.get("metaData", {})
-                .get("mosaicProviderJobCardsModel", {})
-                .get("results")
+            not jobs.get("metaData", {})
+            .get("mosaicProviderJobCardsModel", {})
+            .get("results")
         ):
-            raise Exception('No jobs found.')
+            raise Exception("No jobs found.")
 
         for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
             job_url = f'{self.job_url}{job["jobkey"]}'
@@ -95,9 +93,7 @@ class IndeedScraper(Scraper):
             compensation = None
             if extracted_salary:
                 salary_snippet = job.get("salarySnippet")
-                currency = (
-                    salary_snippet.get("currency") if salary_snippet else None
-                )
+                currency = salary_snippet.get("currency") if salary_snippet else None
                 interval = (extracted_salary.get("type"),)
                 if isinstance(interval, tuple):
                     interval = interval[0]
@@ -145,7 +141,9 @@ class IndeedScraper(Scraper):
             client_identifier="chrome112", random_tls_extension_order=True
         )
 
-        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
+        pages_to_process = (
+            math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
+        )
 
         try:
             #: get first page to initialize session
@@ -153,9 +151,8 @@ class IndeedScraper(Scraper):
 
             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
-                    executor.submit(
-                        self.scrape_page, scraper_input, page, session
-                    ) for page in range(1, pages_to_process + 1)
+                    executor.submit(self.scrape_page, scraper_input, page, session)
+                    for page in range(1, pages_to_process + 1)
                 ]
 
                 for future in futures:
@@ -180,7 +177,7 @@ class IndeedScraper(Scraper):
             )
 
         if len(job_list) > scraper_input.results_wanted:
-            job_list = job_list[:scraper_input.results_wanted]
+            job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
@@ -224,9 +221,9 @@ class IndeedScraper(Scraper):
         script_tags = soup.find_all("script")
         for tag in script_tags:
             if (
-                    tag.string
-                    and "mosaic.providerData" in tag.string
-                    and "mosaic-provider-jobcards" in tag.string
+                tag.string
+                and "mosaic.providerData" in tag.string
+                and "mosaic-provider-jobcards" in tag.string
             ):
                 return tag
         return None
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 712cd96..837d237 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -26,12 +26,8 @@ class ZipRecruiterScraper(Scraper):
         self.seen_urls = set()
 
     def scrape_page(
-            self,
-            scraper_input: ScraperInput,
-            page: int,
-            session: tls_client.Session
+        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
     ) -> tuple[list[JobPost], int | None]:
-
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
@@ -140,9 +136,8 @@ class ZipRecruiterScraper(Scraper):
 
             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
-                    executor.submit(
-                        self.scrape_page, scraper_input, page, session
-                    ) for page in range(2, pages_to_process + 1)
+                    executor.submit(self.scrape_page, scraper_input, page, session)
+                    for page in range(2, pages_to_process + 1)
                 ]
 
                 for future in futures:
@@ -159,7 +154,7 @@ class ZipRecruiterScraper(Scraper):
         #: note: this does not handle if the results are more or less than the results_wanted
 
         if len(job_list) > scraper_input.results_wanted:
-            job_list = job_list[:scraper_input.results_wanted]
+            job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
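One detail worth pinning down across the series is the page arithmetic the scrapers settle on after PATCH 3: ZipRecruiter pages are 1-indexed and Indeed pages 0-indexed, each scraper fetches its first page before the pool, and that is why only Indeed carries the trailing "- 1". A quick check using the jobs_per_page values from the diffs:

import math

results_wanted = 100

#: ZipRecruiter: 20 jobs per page, page 1 fetched up front,
#: pool receives pages 2..pages_to_process
zip_pages = math.ceil(results_wanted / 20)
print(zip_pages)  #: 5 -> pool runs pages 2-5

#: Indeed: 15 jobs per page, page 0 fetched up front, hence the "- 1";
#: pool receives pages 1..pages_to_process
indeed_pages = math.ceil(results_wanted / 15) - 1
print(indeed_pages)  #: 6 -> pool runs pages 1-6, 7 pages * 15 >= 100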