From 444f65ebbc4c48791174f73e701ee954ef7f09cc Mon Sep 17 00:00:00 2001
From: VitaminB16
Date: Sat, 9 Mar 2024 19:58:23 +0000
Subject: [PATCH] format: jobspy/scrapers/glassdoor

---
 src/jobspy/scrapers/glassdoor/__init__.py | 103 +++++++++++++---------
 1 file changed, 60 insertions(+), 43 deletions(-)

diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 3b85789..0d85aa6 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -4,23 +4,23 @@ jobspy.scrapers.glassdoor
 
 This module contains routines to scrape Glassdoor.
 """
+
 from __future__ import annotations
 
-import json
 import re
-
+import json
 import requests
-from typing import Optional
+from typing import Optional, Tuple
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-from ..utils import extract_emails_from_text
 from .. import Scraper, ScraperInput, Site
+from ..utils import extract_emails_from_text
 from ..exceptions import GlassdoorException
 from ..utils import (
     create_session,
     markdown_converter,
-    logger
+    logger,
 )
 from ...jobs import (
     JobPost,
@@ -29,7 +29,7 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    DescriptionFormat
+    DescriptionFormat,
 )
 
 
@@ -61,25 +61,22 @@ class GlassdoorScraper(Scraper):
 
         self.session = create_session(self.proxy, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
-        self.headers['gd-csrf-token'] = token if token else self.fallback_token
+        self.headers["gd-csrf-token"] = token if token else self.fallback_token
 
         location_id, location_type = self._get_location(
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
-            logger.error('Glassdoor: location not parsed')
+            logger.error("Glassdoor: location not parsed")
             return JobResponse(jobs=[])
         all_jobs: list[JobPost] = []
         cursor = None
 
-        for page in range(
-            1 + (scraper_input.offset // self.jobs_per_page),
-            min(
-                (scraper_input.results_wanted // self.jobs_per_page) + 2,
-                self.max_pages + 1,
-            ),
-        ):
-            logger.info(f'Glassdoor search page: {page}')
+        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
+        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
+        range_end = min(tot_pages, self.max_pages + 1)
+        for page in range(range_start, range_end):
+            logger.info(f"Glassdoor search page: {page}")
             try:
                 jobs, cursor = self._fetch_jobs_page(
                     scraper_input, location_id, location_type, page, cursor
@@ -89,7 +86,7 @@
                     all_jobs = all_jobs[: scraper_input.results_wanted]
                     break
             except Exception as e:
-                logger.error(f'Glassdoor: {str(e)}')
+                logger.error(f"Glassdoor: {str(e)}")
                 break
         return JobResponse(jobs=all_jobs)
 
@@ -100,39 +97,48 @@
         location_type: str,
         page_num: int,
         cursor: str | None,
-    ) -> (list[JobPost], str | None):
+    ) -> Tuple[list[JobPost], str | None]:
         """
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
         """
         jobs = []
         self.scraper_input = scraper_input
         try:
-            payload = self._add_payload(
-                location_id, location_type, page_num, cursor
-            )
+            payload = self._add_payload(location_id, location_type, page_num, cursor)
             response = self.session.post(
-                f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
+                f"{self.base_url}/graph",
+                headers=self.headers,
+                timeout_seconds=15,
+                data=payload,
             )
             if response.status_code != 200:
-                raise GlassdoorException(f"bad response status code: {response.status_code}")
+                exc_msg = f"bad response status code: {response.status_code}"
+                raise GlassdoorException(exc_msg)
             res_json = response.json()[0]
             if "errors" in res_json:
                 raise ValueError("Error encountered in API response")
-        except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
-            logger.error(f'Glassdoor: {str(e)}')
+        except (
+            requests.exceptions.ReadTimeout,
+            GlassdoorException,
+            ValueError,
+            Exception,
+        ) as e:
+            logger.error(f"Glassdoor: {str(e)}")
             return jobs, None
 
         jobs_data = res_json["data"]["jobListings"]["jobListings"]
 
         with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
-            future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
+            future_to_job_data = {
+                executor.submit(self._process_job, job): job for job in jobs_data
+            }
             for future in as_completed(future_to_job_data):
                 try:
                     job_post = future.result()
                     if job_post:
                         jobs.append(job_post)
                 except Exception as exc:
-                    raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
+                    raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
 
         return jobs, self.get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
@@ -142,7 +148,9 @@
         """
         Fetches csrf token needed for API by visiting a generic page
         """
-        res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
+        res = self.session.get(
+            f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
+        )
         pattern = r'"token":\s*"([^"]+)"'
         matches = re.findall(pattern, res.text)
         token = None
@@ -155,19 +163,20 @@
         Processes a single job and fetches its description.
         """
         job_id = job_data["jobview"]["job"]["listingId"]
-        job_url = f'{self.base_url}job-listing/j?jl={job_id}'
+        job_url = f"{self.base_url}job-listing/j?jl={job_id}"
         if job_url in self.seen_urls:
             return None
         self.seen_urls.add(job_url)
         job = job_data["jobview"]
         title = job["job"]["jobTitleText"]
         company_name = job["header"]["employerNameFromSearch"]
-        company_id = job_data['jobview']['header']['employer']['id']
+        company_id = job_data["jobview"]["header"]["employer"]["id"]
         location_name = job["header"].get("locationName", "")
         location_type = job["header"].get("locationType", "")
         age_in_days = job["header"].get("ageInDays")
         is_remote, location = False, None
-        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
+        age = timedelta(days=age_in_days) if age_in_days is not None else None
+        date_posted = (datetime.now() - age).date() if age is not None else None
 
         if location_type == "S":
             is_remote = True
@@ -179,9 +188,10 @@
             description = self._fetch_job_description(job_id)
         except:
             description = None
+        company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
         return JobPost(
             title=title,
-            company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
+            company_url=company_url if company_id else None,
             company_name=company_name,
             date_posted=date_posted,
             job_url=job_url,
@@ -203,7 +213,7 @@
                 "variables": {
                     "jl": job_id,
                     "queryString": "q",
-                    "pageTypeEnum": "SERP"
+                    "pageTypeEnum": "SERP",
                 },
                 "query": """
         query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
@@ -218,15 +228,17 @@
                 __typename
               }
             }
-        """
+        """,
             }
         ]
         res = requests.post(url, json=body, headers=self.headers)
         if res.status_code != 200:
             return None
         data = res.json()[0]
-        desc = data['data']['jobview']['job']['description']
-        return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
+        desc = data["data"]["jobview"]["job"]["description"]
+        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+            desc = markdown_converter(desc)
+        return desc
 
     def _get_location(self, location: str, is_remote: bool) -> (int, str):
         if not location or is_remote:
@@ -236,10 +248,13 @@
         res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:
-                logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
+                err = "429 Response - Blocked by Glassdoor for too many requests"
+                logger.error(err)
                 return None, None
             else:
-                logger.error(f'Glassdoor response status code {res.status_code}')
+                err = f"Glassdoor response status code {res.status_code}"
+                err += f" - {res.text}"
+                logger.error(err)
                 return None, None
 
         items = res.json()
@@ -250,7 +265,7 @@
             location_type = "CITY"
         elif location_type == "S":
             location_type = "STATE"
-        elif location_type == 'N':
+        elif location_type == "N":
             location_type = "COUNTRY"
         return int(items[0]["locationId"]), location_type
 
@@ -261,7 +276,9 @@
         page_num: int,
         cursor: str | None = None,
     ) -> str:
-        fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
+        fromage = None
+        if self.scraper_input.hours_old:
+            fromage = max(self.scraper_input.hours_old // 24, 1)
         filter_params = []
         if self.scraper_input.easy_apply:
             filter_params.append({"filterKey": "applicationType", "values": "1"})
@@ -280,9 +297,9 @@
                 "pageNumber": page_num,
                 "pageCursor": cursor,
                 "fromage": fromage,
-                "sort": "date"
+                "sort": "date",
             },
-            "query": self.query_template
+            "query": self.query_template,
         }
         if self.scraper_input.job_type:
             payload["variables"]["filterParams"].append(
@@ -514,4 +531,4 @@
         }
         __typename
       }
-    """
\ No newline at end of file
+    """
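
For anyone smoke-testing the reformatted module, below is a minimal sketch of how the Glassdoor scraper is normally driven through jobspy's public entry point. The `scrape_jobs` keywords shown (`site_name`, `search_term`, `location`, `results_wanted`, `hours_old`) are assumed from the package's documented usage and are not part of this patch; adjust them to your installed version.

    # Sketch only: exercises GlassdoorScraper end to end via the public API.
    # Keyword names below are assumptions taken from jobspy's README, not this patch.
    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name=["glassdoor"],      # routes the search to the scraper formatted above
        search_term="software engineer",
        location="London, UK",
        results_wanted=20,            # feeds ScraperInput.results_wanted and the page math in scrape()
        hours_old=72,                 # becomes the "fromage" filter built in _add_payload
    )
    print(jobs.head())                # scrape_jobs returns a pandas DataFrame of JobPost fields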