mirror of https://github.com/Bunsly/JobSpy
format: jobspy/scrapers/glassdoor
parent
63790da29f
commit
444f65ebbc
|
@ -4,23 +4,23 @@ jobspy.scrapers.glassdoor
|
||||||
|
|
||||||
This module contains routines to scrape Glassdoor.
|
This module contains routines to scrape Glassdoor.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
import requests
|
import requests
|
||||||
from typing import Optional
|
from typing import Optional, Tuple
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from ..utils import extract_emails_from_text
|
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
|
from ..utils import extract_emails_from_text
|
||||||
from ..exceptions import GlassdoorException
|
from ..exceptions import GlassdoorException
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
logger
|
logger,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -29,7 +29,7 @@ from ...jobs import (
|
||||||
Location,
|
Location,
|
||||||
JobResponse,
|
JobResponse,
|
||||||
JobType,
|
JobType,
|
||||||
DescriptionFormat
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,25 +61,22 @@ class GlassdoorScraper(Scraper):
|
||||||
|
|
||||||
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
|
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
|
||||||
token = self._get_csrf_token()
|
token = self._get_csrf_token()
|
||||||
self.headers['gd-csrf-token'] = token if token else self.fallback_token
|
self.headers["gd-csrf-token"] = token if token else self.fallback_token
|
||||||
|
|
||||||
location_id, location_type = self._get_location(
|
location_id, location_type = self._get_location(
|
||||||
scraper_input.location, scraper_input.is_remote
|
scraper_input.location, scraper_input.is_remote
|
||||||
)
|
)
|
||||||
if location_type is None:
|
if location_type is None:
|
||||||
logger.error('Glassdoor: location not parsed')
|
logger.error("Glassdoor: location not parsed")
|
||||||
return JobResponse(jobs=[])
|
return JobResponse(jobs=[])
|
||||||
all_jobs: list[JobPost] = []
|
all_jobs: list[JobPost] = []
|
||||||
cursor = None
|
cursor = None
|
||||||
|
|
||||||
for page in range(
|
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
||||||
1 + (scraper_input.offset // self.jobs_per_page),
|
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
||||||
min(
|
range_end = min(tot_pages, self.max_pages + 1)
|
||||||
(scraper_input.results_wanted // self.jobs_per_page) + 2,
|
for page in range(range_start, range_end):
|
||||||
self.max_pages + 1,
|
logger.info(f"Glassdoor search page: {page}")
|
||||||
),
|
|
||||||
):
|
|
||||||
logger.info(f'Glassdoor search page: {page}')
|
|
||||||
try:
|
try:
|
||||||
jobs, cursor = self._fetch_jobs_page(
|
jobs, cursor = self._fetch_jobs_page(
|
||||||
scraper_input, location_id, location_type, page, cursor
|
scraper_input, location_id, location_type, page, cursor
|
||||||
|
@ -89,7 +86,7 @@ class GlassdoorScraper(Scraper):
|
||||||
all_jobs = all_jobs[: scraper_input.results_wanted]
|
all_jobs = all_jobs[: scraper_input.results_wanted]
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f'Glassdoor: {str(e)}')
|
logger.error(f"Glassdoor: {str(e)}")
|
||||||
break
|
break
|
||||||
return JobResponse(jobs=all_jobs)
|
return JobResponse(jobs=all_jobs)
|
||||||
|
|
||||||
|
@ -100,39 +97,48 @@ class GlassdoorScraper(Scraper):
|
||||||
location_type: str,
|
location_type: str,
|
||||||
page_num: int,
|
page_num: int,
|
||||||
cursor: str | None,
|
cursor: str | None,
|
||||||
) -> (list[JobPost], str | None):
|
) -> Tuple[list[JobPost], str | None]:
|
||||||
"""
|
"""
|
||||||
Scrapes a page of Glassdoor for jobs with scraper_input criteria
|
Scrapes a page of Glassdoor for jobs with scraper_input criteria
|
||||||
"""
|
"""
|
||||||
jobs = []
|
jobs = []
|
||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
try:
|
try:
|
||||||
payload = self._add_payload(
|
payload = self._add_payload(location_id, location_type, page_num, cursor)
|
||||||
location_id, location_type, page_num, cursor
|
|
||||||
)
|
|
||||||
response = self.session.post(
|
response = self.session.post(
|
||||||
f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
|
f"{self.base_url}/graph",
|
||||||
|
headers=self.headers,
|
||||||
|
timeout_seconds=15,
|
||||||
|
data=payload,
|
||||||
)
|
)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
raise GlassdoorException(f"bad response status code: {response.status_code}")
|
exc_msg = f"bad response status code: {response.status_code}"
|
||||||
|
raise GlassdoorException(exc_msg)
|
||||||
res_json = response.json()[0]
|
res_json = response.json()[0]
|
||||||
if "errors" in res_json:
|
if "errors" in res_json:
|
||||||
raise ValueError("Error encountered in API response")
|
raise ValueError("Error encountered in API response")
|
||||||
except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
|
except (
|
||||||
logger.error(f'Glassdoor: {str(e)}')
|
requests.exceptions.ReadTimeout,
|
||||||
|
GlassdoorException,
|
||||||
|
ValueError,
|
||||||
|
Exception,
|
||||||
|
) as e:
|
||||||
|
logger.error(f"Glassdoor: {str(e)}")
|
||||||
return jobs, None
|
return jobs, None
|
||||||
|
|
||||||
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||||
future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
|
future_to_job_data = {
|
||||||
|
executor.submit(self._process_job, job): job for job in jobs_data
|
||||||
|
}
|
||||||
for future in as_completed(future_to_job_data):
|
for future in as_completed(future_to_job_data):
|
||||||
try:
|
try:
|
||||||
job_post = future.result()
|
job_post = future.result()
|
||||||
if job_post:
|
if job_post:
|
||||||
jobs.append(job_post)
|
jobs.append(job_post)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
|
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
|
||||||
|
|
||||||
return jobs, self.get_cursor_for_page(
|
return jobs, self.get_cursor_for_page(
|
||||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||||
|
@ -142,7 +148,9 @@ class GlassdoorScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
Fetches csrf token needed for API by visiting a generic page
|
Fetches csrf token needed for API by visiting a generic page
|
||||||
"""
|
"""
|
||||||
res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
|
res = self.session.get(
|
||||||
|
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
|
||||||
|
)
|
||||||
pattern = r'"token":\s*"([^"]+)"'
|
pattern = r'"token":\s*"([^"]+)"'
|
||||||
matches = re.findall(pattern, res.text)
|
matches = re.findall(pattern, res.text)
|
||||||
token = None
|
token = None
|
||||||
|
@ -155,19 +163,20 @@ class GlassdoorScraper(Scraper):
|
||||||
Processes a single job and fetches its description.
|
Processes a single job and fetches its description.
|
||||||
"""
|
"""
|
||||||
job_id = job_data["jobview"]["job"]["listingId"]
|
job_id = job_data["jobview"]["job"]["listingId"]
|
||||||
job_url = f'{self.base_url}job-listing/j?jl={job_id}'
|
job_url = f"{self.base_url}job-listing/j?jl={job_id}"
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
return None
|
return None
|
||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
job = job_data["jobview"]
|
job = job_data["jobview"]
|
||||||
title = job["job"]["jobTitleText"]
|
title = job["job"]["jobTitleText"]
|
||||||
company_name = job["header"]["employerNameFromSearch"]
|
company_name = job["header"]["employerNameFromSearch"]
|
||||||
company_id = job_data['jobview']['header']['employer']['id']
|
company_id = job_data["jobview"]["header"]["employer"]["id"]
|
||||||
location_name = job["header"].get("locationName", "")
|
location_name = job["header"].get("locationName", "")
|
||||||
location_type = job["header"].get("locationType", "")
|
location_type = job["header"].get("locationType", "")
|
||||||
age_in_days = job["header"].get("ageInDays")
|
age_in_days = job["header"].get("ageInDays")
|
||||||
is_remote, location = False, None
|
is_remote, location = False, None
|
||||||
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
|
date_diff = (datetime.now() - timedelta(days=age_in_days)).date()
|
||||||
|
date_posted = date_diff if age_in_days is not None else None
|
||||||
|
|
||||||
if location_type == "S":
|
if location_type == "S":
|
||||||
is_remote = True
|
is_remote = True
|
||||||
|
@ -179,9 +188,10 @@ class GlassdoorScraper(Scraper):
|
||||||
description = self._fetch_job_description(job_id)
|
description = self._fetch_job_description(job_id)
|
||||||
except:
|
except:
|
||||||
description = None
|
description = None
|
||||||
|
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
|
||||||
return JobPost(
|
return JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
|
company_url=company_url if company_id else None,
|
||||||
company_name=company_name,
|
company_name=company_name,
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
|
@ -203,7 +213,7 @@ class GlassdoorScraper(Scraper):
|
||||||
"variables": {
|
"variables": {
|
||||||
"jl": job_id,
|
"jl": job_id,
|
||||||
"queryString": "q",
|
"queryString": "q",
|
||||||
"pageTypeEnum": "SERP"
|
"pageTypeEnum": "SERP",
|
||||||
},
|
},
|
||||||
"query": """
|
"query": """
|
||||||
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
||||||
|
@ -218,15 +228,17 @@ class GlassdoorScraper(Scraper):
|
||||||
__typename
|
__typename
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
""",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
res = requests.post(url, json=body, headers=self.headers)
|
res = requests.post(url, json=body, headers=self.headers)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
return None
|
return None
|
||||||
data = res.json()[0]
|
data = res.json()[0]
|
||||||
desc = data['data']['jobview']['job']['description']
|
desc = data["data"]["jobview"]["job"]["description"]
|
||||||
return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
|
desc = markdown_converter(desc)
|
||||||
|
return desc
|
||||||
|
|
||||||
def _get_location(self, location: str, is_remote: bool) -> (int, str):
|
def _get_location(self, location: str, is_remote: bool) -> (int, str):
|
||||||
if not location or is_remote:
|
if not location or is_remote:
|
||||||
|
@ -236,10 +248,13 @@ class GlassdoorScraper(Scraper):
|
||||||
res = self.session.get(url, headers=self.headers)
|
res = self.session.get(url, headers=self.headers)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
if res.status_code == 429:
|
if res.status_code == 429:
|
||||||
logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
|
err = f"429 Response - Blocked by Glassdoor for too many requests"
|
||||||
|
logger.error(err)
|
||||||
return None, None
|
return None, None
|
||||||
else:
|
else:
|
||||||
logger.error(f'Glassdoor response status code {res.status_code}')
|
err = f"Glassdoor response status code {res.status_code}"
|
||||||
|
err += f" - {res.text}"
|
||||||
|
logger.error(f"Glassdoor response status code {res.status_code}")
|
||||||
return None, None
|
return None, None
|
||||||
items = res.json()
|
items = res.json()
|
||||||
|
|
||||||
|
@ -250,7 +265,7 @@ class GlassdoorScraper(Scraper):
|
||||||
location_type = "CITY"
|
location_type = "CITY"
|
||||||
elif location_type == "S":
|
elif location_type == "S":
|
||||||
location_type = "STATE"
|
location_type = "STATE"
|
||||||
elif location_type == 'N':
|
elif location_type == "N":
|
||||||
location_type = "COUNTRY"
|
location_type = "COUNTRY"
|
||||||
return int(items[0]["locationId"]), location_type
|
return int(items[0]["locationId"]), location_type
|
||||||
|
|
||||||
|
@ -261,7 +276,9 @@ class GlassdoorScraper(Scraper):
|
||||||
page_num: int,
|
page_num: int,
|
||||||
cursor: str | None = None,
|
cursor: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
|
fromage = None
|
||||||
|
if self.scraper_input.hours_old:
|
||||||
|
fromage = max(self.scraper_input.hours_old // 24, 1)
|
||||||
filter_params = []
|
filter_params = []
|
||||||
if self.scraper_input.easy_apply:
|
if self.scraper_input.easy_apply:
|
||||||
filter_params.append({"filterKey": "applicationType", "values": "1"})
|
filter_params.append({"filterKey": "applicationType", "values": "1"})
|
||||||
|
@ -280,9 +297,9 @@ class GlassdoorScraper(Scraper):
|
||||||
"pageNumber": page_num,
|
"pageNumber": page_num,
|
||||||
"pageCursor": cursor,
|
"pageCursor": cursor,
|
||||||
"fromage": fromage,
|
"fromage": fromage,
|
||||||
"sort": "date"
|
"sort": "date",
|
||||||
},
|
},
|
||||||
"query": self.query_template
|
"query": self.query_template,
|
||||||
}
|
}
|
||||||
if self.scraper_input.job_type:
|
if self.scraper_input.job_type:
|
||||||
payload["variables"]["filterParams"].append(
|
payload["variables"]["filterParams"].append(
|
||||||
|
|
Loading…
Reference in New Issue