From ce3bd84ee5614d97ce39278752cd334f641b9467 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Feb 2024 18:21:55 -0600 Subject: [PATCH] fix: indeed parse description bug (#96) * fix(indeed): full descr * chore: version --- pyproject.toml | 2 +- src/jobspy/scrapers/indeed/__init__.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 783de19..c3b8d0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.38" +version = "1.1.39" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index f1a714b..06a8752 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -154,8 +154,9 @@ class IndeedScraper(Scraper): ) return job_post + workers = 10 if scraper_input.full_description else 10 # possibly lessen 10 when fetching desc based on feedback jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] - with ThreadPoolExecutor(max_workers=1) as executor: + with ThreadPoolExecutor(max_workers=workers) as executor: job_results: list[Future] = [ executor.submit(process_job, job) for job in jobs ] @@ -206,7 +207,7 @@ class IndeedScraper(Scraper): parsed_url = urllib.parse.urlparse(job_page_url) params = urllib.parse.parse_qs(parsed_url.query) jk_value = params.get("jk", [None])[0] - formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" + formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1" session = create_session(self.proxy) try: @@ -223,10 +224,18 @@ class IndeedScraper(Scraper): return None try: - data = json.loads(response.text) - job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][ - "sanitizedJobDescription" - ] + soup = BeautifulSoup(response.text, 'html.parser') + script_tags = soup.find_all('script') + + job_description = '' + for tag in script_tags: + if 'window._initialData' in tag.text: + json_str = tag.text + json_str = json_str.split('window._initialData=')[1] + json_str = json_str.rsplit(';', 1)[0] + data = json.loads(json_str) + job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"] + break except (KeyError, TypeError, IndexError): return None