# jobspy/glassdoor/__init__.py
from __future__ import annotations

# Standard library
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from typing import Tuple

# Third-party
import requests

# Local package
from jobspy.exception import GlassdoorException
from jobspy.glassdoor.constant import fallback_token, query_template, headers
from jobspy.glassdoor.util import (
    get_cursor_for_page,
    parse_compensation,
    parse_location,
)
from jobspy.model import (
    JobPost,
    JobResponse,
    DescriptionFormat,
    Scraper,
    ScraperInput,
    Site,
)
from jobspy.util import (
    extract_emails_from_text,
    create_logger,
    create_session,
    markdown_converter,
)

# Module-level logger shared by the scraper below.
log = create_logger("Glassdoor")
class Glassdoor(Scraper):
    """Scrapes job listings from Glassdoor via its GraphQL endpoint."""

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes GlassdoorScraper with the Glassdoor job search url

        :param proxies: optional proxy (or list of proxies) for the HTTP session
        :param ca_cert: optional CA certificate bundle path for TLS verification
        """
        site = Site(Site.GLASSDOOR)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.base_url = None  # country-specific root URL, resolved in scrape()
        self.country = None
        self.session = None  # requests session, created in scrape()
        self.scraper_input = None
        self.jobs_per_page = 30  # page size the GraphQL search API returns
        self.max_pages = 30  # hard cap on pagination depth
        self.seen_urls = set()  # dedupes jobs that reappear across pages

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Glassdoor for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # 900 = max_pages * jobs_per_page; the API won't page past that.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
        self.base_url = self.scraper_input.country.get_glassdoor_url()

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
        )
        token = self._get_csrf_token()
        headers["gd-csrf-token"] = token if token else fallback_token
        self.session.headers.update(headers)

        location_id, location_type = self._get_location(
            scraper_input.location, scraper_input.is_remote
        )
        if location_type is None:
            log.error("Glassdoor: location not parsed")
            return JobResponse(jobs=[])

        job_list: list[JobPost] = []
        cursor = None  # pagination cursor returned by each page fetch

        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            log.info(f"search page: {page} / {range_end - 1}")
            try:
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
                job_list.extend(jobs)
                # Stop when a page comes back empty or we've hit the target.
                if not jobs or len(job_list) >= scraper_input.results_wanted:
                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                log.error(f"Glassdoor: {str(e)}")
                break
        return JobResponse(jobs=job_list)

    def _fetch_jobs_page(
        self,
        scraper_input: ScraperInput,
        location_id: int,
        location_type: str,
        page_num: int,
        cursor: str | None,
    ) -> Tuple[list[JobPost], str | None]:
        """
        Scrapes a page of Glassdoor for jobs with scraper_input criteria

        :return: (jobs on this page, cursor for the next page or None)
        """
        jobs = []
        self.scraper_input = scraper_input
        try:
            payload = self._add_payload(location_id, location_type, page_num, cursor)
            response = self.session.post(
                f"{self.base_url}/graph",
                timeout_seconds=15,
                data=payload,
            )
            if response.status_code != 200:
                exc_msg = f"bad response status code: {response.status_code}"
                raise GlassdoorException(exc_msg)
            res_json = response.json()[0]
            if "errors" in res_json:
                raise ValueError("Error encountered in API response")
        except Exception as e:
            # Any failure on this page (timeout, bad status, API error) is
            # logged and treated as "no results, no next cursor".
            log.error(f"Glassdoor: {str(e)}")
            return jobs, None

        jobs_data = res_json["data"]["jobListings"]["jobListings"]

        # One worker per listing: each job needs a follow-up request for its
        # description, so they are fetched concurrently.
        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
            future_to_job_data = {
                executor.submit(self._process_job, job): job for job in jobs_data
            }
            for future in as_completed(future_to_job_data):
                try:
                    job_post = future.result()
                    if job_post:
                        jobs.append(job_post)
                except Exception as exc:
                    raise GlassdoorException(f"Glassdoor generated an exception: {exc}")

        return jobs, get_cursor_for_page(
            res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
        )

    def _get_csrf_token(self) -> str | None:
        """
        Fetches csrf token needed for API by visiting a generic page
        """
        res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm")
        pattern = r'"token":\s*"([^"]+)"'
        matches = re.findall(pattern, res.text)
        token = None
        if matches:
            token = matches[0]
        return token

    def _process_job(self, job_data) -> JobPost | None:
        """
        Processes a single job and fetches its description.

        :param job_data: one entry from the GraphQL jobListings payload
        :return: a JobPost, or None if this listing was already seen
        """
        job_id = job_data["jobview"]["job"]["listingId"]
        job_url = f"{self.base_url}job-listing/j?jl={job_id}"
        if job_url in self.seen_urls:
            return None
        self.seen_urls.add(job_url)
        job = job_data["jobview"]
        title = job["job"]["jobTitleText"]
        company_name = job["header"]["employerNameFromSearch"]
        # `employer` can be null in the payload; guard instead of crashing.
        company_id = (job["header"].get("employer") or {}).get("id")
        location_name = job["header"].get("locationName", "")
        location_type = job["header"].get("locationType", "")
        age_in_days = job["header"].get("ageInDays")
        is_remote, location = False, None
        # Only derive a posted date when the API supplied an age; the previous
        # version computed timedelta(days=None) first and raised TypeError.
        date_posted = (
            (datetime.now() - timedelta(days=age_in_days)).date()
            if age_in_days is not None
            else None
        )

        if location_type == "S":  # "S" marks a remote listing
            is_remote = True
        else:
            location = parse_location(location_name)

        compensation = parse_compensation(job["header"])
        try:
            description = self._fetch_job_description(job_id)
        except Exception:
            # Description is best-effort; the listing is still usable without it.
            description = None

        company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
        company_logo = (
            job_data["jobview"].get("overview", {}).get("squareLogoUrl", None)
        )
        listing_type = (
            job_data["jobview"]
            .get("header", {})
            .get("adOrderSponsorshipLevel", "")
            .lower()
        )
        return JobPost(
            id=f"gd-{job_id}",
            title=title,
            company_url=company_url if company_id else None,
            company_name=company_name,
            date_posted=date_posted,
            job_url=job_url,
            location=location,
            compensation=compensation,
            is_remote=is_remote,
            description=description,
            emails=extract_emails_from_text(description) if description else None,
            company_logo=company_logo,
            listing_type=listing_type,
        )

    def _fetch_job_description(self, job_id) -> str | None:
        """
        Fetches the job description for a single job ID.

        :return: description text (markdown-converted if requested), or None
                 on a non-200 response
        """
        url = f"{self.base_url}/graph"
        body = [
            {
                "operationName": "JobDetailQuery",
                "variables": {
                    "jl": job_id,
                    "queryString": "q",
                    "pageTypeEnum": "SERP",
                },
                "query": """
            query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
                jobview: jobView(
                    listingId: $jl
                    contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
                ) {
                    job {
                        description
                        __typename
                    }
                    __typename
                }
            }
            """,
            }
        ]
        # NOTE(review): uses a plain requests.post, bypassing self.session —
        # proxies/retries do not apply here; confirm whether that is intended.
        res = requests.post(url, json=body, headers=headers)
        if res.status_code != 200:
            return None
        data = res.json()[0]
        desc = data["data"]["jobview"]["job"]["description"]
        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            desc = markdown_converter(desc)
        return desc

    def _get_location(
        self, location: str, is_remote: bool
    ) -> Tuple[int | str | None, str | None]:
        """
        Resolves a free-text location to Glassdoor's (locationId, locationType).

        :return: (location id, one of "CITY"/"STATE"/"COUNTRY"), or (None, None)
                 when the lookup request fails
        :raises ValueError: if Glassdoor returns no match for the location
        """
        if not location or is_remote:
            return "11047", "STATE"  # remote options
        url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
        res = self.session.get(url)
        if res.status_code != 200:
            if res.status_code == 429:
                err = f"429 Response - Blocked by Glassdoor for too many requests"
                log.error(err)
                return None, None
            else:
                err = f"Glassdoor response status code {res.status_code}"
                err += f" - {res.text}"
                log.error(f"Glassdoor response status code {res.status_code}")
                return None, None
        items = res.json()

        if not items:
            raise ValueError(f"Location '{location}' not found on Glassdoor")
        location_type = items[0]["locationType"]
        if location_type == "C":
            location_type = "CITY"
        elif location_type == "S":
            location_type = "STATE"
        elif location_type == "N":
            location_type = "COUNTRY"
        return int(items[0]["locationId"]), location_type

    def _add_payload(
        self,
        location_id: int,
        location_type: str,
        page_num: int,
        cursor: str | None = None,
    ) -> str:
        """
        Builds the JSON body for the JobSearchResultsQuery GraphQL request.

        :return: JSON-encoded list containing the single query payload
        """
        fromage = None
        if self.scraper_input.hours_old:
            # Glassdoor filters by whole days; round up to at least 1 day.
            fromage = max(self.scraper_input.hours_old // 24, 1)
        filter_params = []
        if self.scraper_input.easy_apply:
            filter_params.append({"filterKey": "applicationType", "values": "1"})
        if fromage:
            filter_params.append({"filterKey": "fromAge", "values": str(fromage)})
        payload = {
            "operationName": "JobSearchResultsQuery",
            "variables": {
                "excludeJobListingIds": [],
                "filterParams": filter_params,
                "keyword": self.scraper_input.search_term,
                "numJobsToShow": 30,
                "locationType": location_type,
                "locationId": int(location_id),
                "parameterUrlInput": f"IL.0,12_I{location_type}{location_id}",
                "pageNumber": page_num,
                "pageCursor": cursor,
                "fromage": fromage,
                "sort": "date",
            },
            "query": query_template,
        }
        if self.scraper_input.job_type:
            payload["variables"]["filterParams"].append(
                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
            )
        return json.dumps([payload])