format: jobspy/scrapers/glassdoor

pull/127/head
VitaminB16 2024-03-09 19:58:23 +00:00
parent 63790da29f
commit 444f65ebbc
1 changed file with 60 additions and 43 deletions

View File

@@ -4,23 +4,23 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor. This module contains routines to scrape Glassdoor.
""" """
from __future__ import annotations from __future__ import annotations
import json
import re import re
import json
import requests import requests
from typing import Optional from typing import Optional, Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from ..utils import extract_emails_from_text
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import ( from ..utils import (
create_session, create_session,
markdown_converter, markdown_converter,
logger logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@@ -29,7 +29,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat DescriptionFormat,
) )
@@ -61,25 +61,22 @@ class GlassdoorScraper(Scraper):
self.session = create_session(self.proxy, is_tls=True, has_retry=True) self.session = create_session(self.proxy, is_tls=True, has_retry=True)
token = self._get_csrf_token() token = self._get_csrf_token()
self.headers['gd-csrf-token'] = token if token else self.fallback_token self.headers["gd-csrf-token"] = token if token else self.fallback_token
location_id, location_type = self._get_location( location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote scraper_input.location, scraper_input.is_remote
) )
if location_type is None: if location_type is None:
logger.error('Glassdoor: location not parsed') logger.error("Glassdoor: location not parsed")
return JobResponse(jobs=[]) return JobResponse(jobs=[])
all_jobs: list[JobPost] = [] all_jobs: list[JobPost] = []
cursor = None cursor = None
for page in range( range_start = 1 + (scraper_input.offset // self.jobs_per_page)
1 + (scraper_input.offset // self.jobs_per_page), tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
min( range_end = min(tot_pages, self.max_pages + 1)
(scraper_input.results_wanted // self.jobs_per_page) + 2, for page in range(range_start, range_end):
self.max_pages + 1, logger.info(f"Glassdoor search page: {page}")
),
):
logger.info(f'Glassdoor search page: {page}')
try: try:
jobs, cursor = self._fetch_jobs_page( jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor scraper_input, location_id, location_type, page, cursor
@@ -89,7 +86,7 @@ class GlassdoorScraper(Scraper):
all_jobs = all_jobs[: scraper_input.results_wanted] all_jobs = all_jobs[: scraper_input.results_wanted]
break break
except Exception as e: except Exception as e:
logger.error(f'Glassdoor: {str(e)}') logger.error(f"Glassdoor: {str(e)}")
break break
return JobResponse(jobs=all_jobs) return JobResponse(jobs=all_jobs)
@@ -100,39 +97,48 @@ class GlassdoorScraper(Scraper):
location_type: str, location_type: str,
page_num: int, page_num: int,
cursor: str | None, cursor: str | None,
) -> (list[JobPost], str | None): ) -> Tuple[list[JobPost], str | None]:
""" """
Scrapes a page of Glassdoor for jobs with scraper_input criteria Scrapes a page of Glassdoor for jobs with scraper_input criteria
""" """
jobs = [] jobs = []
self.scraper_input = scraper_input self.scraper_input = scraper_input
try: try:
payload = self._add_payload( payload = self._add_payload(location_id, location_type, page_num, cursor)
location_id, location_type, page_num, cursor
)
response = self.session.post( response = self.session.post(
f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload f"{self.base_url}/graph",
headers=self.headers,
timeout_seconds=15,
data=payload,
) )
if response.status_code != 200: if response.status_code != 200:
raise GlassdoorException(f"bad response status code: {response.status_code}") exc_msg = f"bad response status code: {response.status_code}"
raise GlassdoorException(exc_msg)
res_json = response.json()[0] res_json = response.json()[0]
if "errors" in res_json: if "errors" in res_json:
raise ValueError("Error encountered in API response") raise ValueError("Error encountered in API response")
except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e: except (
logger.error(f'Glassdoor: {str(e)}') requests.exceptions.ReadTimeout,
GlassdoorException,
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
return jobs, None return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"] jobs_data = res_json["data"]["jobListings"]["jobListings"]
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data} future_to_job_data = {
executor.submit(self._process_job, job): job for job in jobs_data
}
for future in as_completed(future_to_job_data): for future in as_completed(future_to_job_data):
try: try:
job_post = future.result() job_post = future.result()
if job_post: if job_post:
jobs.append(job_post) jobs.append(job_post)
except Exception as exc: except Exception as exc:
raise GlassdoorException(f'Glassdoor generated an exception: {exc}') raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
return jobs, self.get_cursor_for_page( return jobs, self.get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
@@ -142,7 +148,9 @@ class GlassdoorScraper(Scraper):
""" """
Fetches csrf token needed for API by visiting a generic page Fetches csrf token needed for API by visiting a generic page
""" """
res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers) res = self.session.get(
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
)
pattern = r'"token":\s*"([^"]+)"' pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text) matches = re.findall(pattern, res.text)
token = None token = None
@@ -155,19 +163,20 @@ class GlassdoorScraper(Scraper):
Processes a single job and fetches its description. Processes a single job and fetches its description.
""" """
job_id = job_data["jobview"]["job"]["listingId"] job_id = job_data["jobview"]["job"]["listingId"]
job_url = f'{self.base_url}job-listing/j?jl={job_id}' job_url = f"{self.base_url}job-listing/j?jl={job_id}"
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
job = job_data["jobview"] job = job_data["jobview"]
title = job["job"]["jobTitleText"] title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"] company_name = job["header"]["employerNameFromSearch"]
company_id = job_data['jobview']['header']['employer']['id'] company_id = job_data["jobview"]["header"]["employer"]["id"]
location_name = job["header"].get("locationName", "") location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "") location_type = job["header"].get("locationType", "")
age_in_days = job["header"].get("ageInDays") age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None date_diff = (datetime.now() - timedelta(days=age_in_days)).date()
date_posted = date_diff if age_in_days is not None else None
if location_type == "S": if location_type == "S":
is_remote = True is_remote = True
@@ -179,9 +188,10 @@ class GlassdoorScraper(Scraper):
description = self._fetch_job_description(job_id) description = self._fetch_job_description(job_id)
except: except:
description = None description = None
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
return JobPost( return JobPost(
title=title, title=title,
company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None, company_url=company_url if company_id else None,
company_name=company_name, company_name=company_name,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
@@ -203,7 +213,7 @@ class GlassdoorScraper(Scraper):
"variables": { "variables": {
"jl": job_id, "jl": job_id,
"queryString": "q", "queryString": "q",
"pageTypeEnum": "SERP" "pageTypeEnum": "SERP",
}, },
"query": """ "query": """
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) { query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
@@ -218,15 +228,17 @@ class GlassdoorScraper(Scraper):
__typename __typename
} }
} }
""" """,
} }
] ]
res = requests.post(url, json=body, headers=self.headers) res = requests.post(url, json=body, headers=self.headers)
if res.status_code != 200: if res.status_code != 200:
return None return None
data = res.json()[0] data = res.json()[0]
desc = data['data']['jobview']['job']['description'] desc = data["data"]["jobview"]["job"]["description"]
return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
desc = markdown_converter(desc)
return desc
def _get_location(self, location: str, is_remote: bool) -> (int, str): def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote: if not location or is_remote:
@@ -236,10 +248,13 @@ class GlassdoorScraper(Scraper):
res = self.session.get(url, headers=self.headers) res = self.session.get(url, headers=self.headers)
if res.status_code != 200: if res.status_code != 200:
if res.status_code == 429: if res.status_code == 429:
logger.error(f'429 Response - Blocked by Glassdoor for too many requests') err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
return None, None return None, None
else: else:
logger.error(f'Glassdoor response status code {res.status_code}') err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
return None, None return None, None
items = res.json() items = res.json()
@@ -250,7 +265,7 @@ class GlassdoorScraper(Scraper):
location_type = "CITY" location_type = "CITY"
elif location_type == "S": elif location_type == "S":
location_type = "STATE" location_type = "STATE"
elif location_type == 'N': elif location_type == "N":
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
@@ -261,7 +276,9 @@ class GlassdoorScraper(Scraper):
page_num: int, page_num: int,
cursor: str | None = None, cursor: str | None = None,
) -> str: ) -> str:
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None fromage = None
if self.scraper_input.hours_old:
fromage = max(self.scraper_input.hours_old // 24, 1)
filter_params = [] filter_params = []
if self.scraper_input.easy_apply: if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"}) filter_params.append({"filterKey": "applicationType", "values": "1"})
@@ -280,9 +297,9 @@ class GlassdoorScraper(Scraper):
"pageNumber": page_num, "pageNumber": page_num,
"pageCursor": cursor, "pageCursor": cursor,
"fromage": fromage, "fromage": fromage,
"sort": "date" "sort": "date",
}, },
"query": self.query_template "query": self.query_template,
} }
if self.scraper_input.job_type: if self.scraper_input.job_type:
payload["variables"]["filterParams"].append( payload["variables"]["filterParams"].append(
@@ -514,4 +531,4 @@ class GlassdoorScraper(Scraper):
} }
__typename __typename
} }
""" """