From ccd18595fe213d5e15c8bf36d46aa5a4af0543b3 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 9 Feb 2024 12:04:14 -0600 Subject: [PATCH] fix(indeed): return no jobs instead of error --- src/jobspy/__init__.py | 5 ++- src/jobspy/scrapers/indeed/__init__.py | 43 ++++++++++++++++---------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index cf0222b..5d96b11 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -1,7 +1,6 @@ import pandas as pd from typing import Tuple -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from .jobs import JobType, Location from .scrapers.indeed import IndeedScraper @@ -119,7 +118,7 @@ def scrape_jobs( executor.submit(worker, site): site for site in scraper_input.site_type } - for future in concurrent.futures.as_completed(future_to_site): + for future in as_completed(future_to_site): site_value, scraped_data = future.result() site_to_jobs_dict[site_value] = scraped_data diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 7dcc52f..695719a 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -80,13 +80,14 @@ class IndeedScraper(Scraper): raise IndeedException(str(e)) soup = BeautifulSoup(response.content, "html.parser") + job_list = [] + total_num_jobs = IndeedScraper.total_jobs(soup) if "did not match any jobs" in response.text: - raise IndeedException("Parsing exception: Search did not match any jobs") + return job_list, total_num_jobs jobs = IndeedScraper.parse_jobs( soup ) #: can raise exception, handled by main scrape function - total_num_jobs = IndeedScraper.total_jobs(soup) if ( not jobs.get("metaData", {}) @@ -152,26 +153,34 @@ class IndeedScraper(Scraper): :param scraper_input: :return: job_response """ - pages_to_process = ( - math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 - ) - - #: get first page to initialize session job_list, total_results = self.scrape_page(scraper_input, 0) + pages_processed = 1 - with ThreadPoolExecutor(max_workers=10) as executor: - futures: list[Future] = [ - executor.submit(self.scrape_page, scraper_input, page) - for page in range(1, pages_to_process + 1) - ] + while len(self.seen_urls) < scraper_input.results_wanted: + pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page) + new_jobs = False - for future in futures: - jobs, _ = future.result() + with ThreadPoolExecutor(max_workers=10) as executor: + futures: list[Future] = [ + executor.submit(self.scrape_page, scraper_input, page + pages_processed) + for page in range(pages_to_process) + ] - job_list += jobs + for future in futures: + jobs, _ = future.result() + if jobs: + job_list += jobs + new_jobs = True + if len(self.seen_urls) >= scraper_input.results_wanted: + break - if len(job_list) > scraper_input.results_wanted: - job_list = job_list[: scraper_input.results_wanted] + pages_processed += pages_to_process + if not new_jobs: + break + + + if len(self.seen_urls) > scraper_input.results_wanted: + job_list = job_list[:scraper_input.results_wanted] job_response = JobResponse( jobs=job_list,