From eb728a572a083336062f4dce8ffaffe45357b689 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sat, 26 Aug 2023 04:34:02 -0500 Subject: [PATCH] feat(ziprecruiter): Add multithreading for individual job handling --- api/core/scrapers/ziprecruiter/__init__.py | 24 +++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 6ac42e5..695d700 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -83,10 +83,15 @@ class ZipRecruiterScraper(Scraper): job_posts = soup.find_all("div", {"class": "job_content"}) - for job in job_posts: + def process_job(job: Tag) -> Optional[JobPost]: + ''' + Parses a job from the job content tag + :param job: BeautifulSoup Tag for one job post + :return JobPost + ''' job_url = job.find("a", {"class": "job_link"})["href"] if job_url in self.seen_urls: - continue + return None title = job.find("h2", {"class": "title"}).text company = job.find("a", {"class": "company_name"}).text.strip() @@ -121,7 +126,14 @@ class ZipRecruiterScraper(Scraper): date_posted=date_posted, job_url=job_url, ) - job_list.append(job_post) + return job_post + + with ThreadPoolExecutor(max_workers=10) as executor: + job_results: list[Future] = [ + executor.submit(process_job, job) for job in job_posts + ] + + job_list = [result.result() for result in job_results if result.result()] return job_list, job_count @@ -171,12 +183,14 @@ class ZipRecruiterScraper(Scraper): return job_response @classmethod - def get_description(cls, job_page_url: str, session: tls_client.Session) -> Tuple[Optional[str], str]: + def get_description( + cls, job_page_url: str, session: tls_client.Session + ) -> Tuple[Optional[str], str]: """ Retrieves job description by going to the job page url :param job_page_url: :param session: - :return: description or None, response url + :return: description or None, response url """ response = session.get( job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True