fix: glassdoor

pull/120/head
Cullen Watson 2024-03-04 16:31:02 -06:00
parent e99ea38a5f
commit 649eab0ca7
5 changed files with 132 additions and 139 deletions

View File

@@ -145,7 +145,7 @@ class Country(Enum):
         else:
             raise Exception(f"Glassdoor is not available for {self.name}")

-    def get_url(self):
+    def get_glassdoor_url(self):
         return f"https://{self.glassdoor_domain_value}/"

     @classmethod

View File

@@ -5,6 +5,8 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
 import json
+import re
 import requests
 from typing import Optional
 from datetime import datetime, timedelta
@@ -28,6 +30,7 @@ from ...jobs import (
     DescriptionFormat
 )

+
 class GlassdoorScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
         """
@@ -41,6 +44,7 @@ class GlassdoorScraper(Scraper):
         self.session = None
         self.scraper_input = None
         self.jobs_per_page = 30
+        self.max_pages = 30
         self.seen_urls = set()

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@@ -51,54 +55,40 @@ class GlassdoorScraper(Scraper):
         """
         self.scraper_input = scraper_input
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
-        self.base_url = self.scraper_input.country.get_url()
+        self.base_url = self.scraper_input.country.get_glassdoor_url()
+
+        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        token = self._get_csrf_token()
+        self.headers['gd-csrf-token'] = token if token else self.fallback_token

         location_id, location_type = self._get_location(
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
+            logger.error('Glassdoor: location not parsed')
             return JobResponse(jobs=[])
         all_jobs: list[JobPost] = []
         cursor = None
-        max_pages = 30
-        urlCount = 0
-        self.session = create_session(self.proxy, is_tls=False, has_retry=True)
-        self.session.get(self.base_url)
-
-        try:
-            print(f'Glassdoor searches: {urlCount}')
-            urlCount += 1
-            for page in range(
-                1 + (scraper_input.offset // self.jobs_per_page),
-                min(
-                    (scraper_input.results_wanted // self.jobs_per_page) + 2,
-                    max_pages + 1,
-                ),
-            ):
-                try:
-                    jobs, cursor = self._fetch_jobs_page(
-                        scraper_input, location_id, location_type, page, cursor
-                    )
-                    all_jobs.extend(jobs)
-                    if len(all_jobs) >= scraper_input.results_wanted:
-                        all_jobs = all_jobs[: scraper_input.results_wanted]
-                        break
-                except requests.exceptions.ReadTimeout as timeout_exception:
-                    logger.error(f"Timeout occurred on page {page}: {str(timeout_exception)}")
-                    # Skip this page and continue to the next
-                    continue
-                except Exception as e:
-                    raise GlassdoorException(str(e))
-        except requests.exceptions.ReadTimeout as timeout_exception:
-            logger.error(f"Timeout occurred on page {page}: {str(timeout_exception)}")
-        except requests.exceptions.HTTPError as http_error:
-            if http_error.response.status_code == 502:
-                logger.error(f"Bad Gateway (502) encountered: {str(http_error)}")
-                # Decide on a retry mechanism, log the error, or take another appropriate action
-            else:
-                raise  # Re-raise the exception if it's not a 502 error
-        except Exception as e:
-            raise GlassdoorException(str(e))
+
+        for page in range(
+            1 + (scraper_input.offset // self.jobs_per_page),
+            min(
+                (scraper_input.results_wanted // self.jobs_per_page) + 2,
+                self.max_pages + 1,
+            ),
+        ):
+            logger.info(f'Glassdoor search page: {page}')
+            try:
+                jobs, cursor = self._fetch_jobs_page(
+                    scraper_input, location_id, location_type, page, cursor
+                )
+                all_jobs.extend(jobs)
+                if not jobs or len(all_jobs) >= scraper_input.results_wanted:
+                    all_jobs = all_jobs[: scraper_input.results_wanted]
+                    break
+            except Exception as e:
+                logger.error(f'Glassdoor: {str(e)}')
+                break

         return JobResponse(jobs=all_jobs)

     def _fetch_jobs_page(
@@ -112,29 +102,26 @@ class GlassdoorScraper(Scraper):
         """
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
         """
+        jobs = []
         self.scraper_input = scraper_input
         try:
             payload = self._add_payload(
                 location_id, location_type, page_num, cursor
             )
             response = self.session.post(
-                f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload
+                f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
             )
             if response.status_code != 200:
-                raise GlassdoorException(
-                    f"bad response status code: {response.status_code}"
-                )
+                raise GlassdoorException(f"bad response status code: {response.status_code}")
             res_json = response.json()[0]
             if "errors" in res_json:
                 raise ValueError("Error encountered in API response")
-        except Exception as e:
-            raise GlassdoorException(str(e))
+        except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
+            logger.error(f'Glassdoor: {str(e)}')
+            return jobs, None

         jobs_data = res_json["data"]["jobListings"]["jobListings"]

-        jobs = []
         with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
             future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
             for future in as_completed(future_to_job_data):
@@ -149,6 +136,18 @@ class GlassdoorScraper(Scraper):
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )

+    def _get_csrf_token(self):
+        """
+        Fetches csrf token needed for API by visiting a generic page
+        """
+        res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
+        pattern = r'"token":\s*"([^"]+)"'
+        matches = re.findall(pattern, res.text)
+        token = None
+        if matches:
+            token = matches[0]
+        return token
+
     def _process_job(self, job_data):
         """
         Processes a single job and fetches its description.
@@ -233,7 +232,7 @@ class GlassdoorScraper(Scraper):
             return "11047", "STATE"  # remote options
         url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
         session = create_session(self.proxy, has_retry=True)
-        res = session.get(url)
+        res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:
                 logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
@@ -282,7 +281,74 @@ class GlassdoorScraper(Scraper):
                 "fromage": fromage,
                 "sort": "date"
             },
-            "query": """
+            "query": self.query_template
+        }
+        if self.scraper_input.job_type:
+            payload["variables"]["filterParams"].append(
+                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
+            )
+        return json.dumps([payload])
+
+    @staticmethod
+    def parse_compensation(data: dict) -> Optional[Compensation]:
+        pay_period = data.get("payPeriod")
+        adjusted_pay = data.get("payPeriodAdjustedPay")
+        currency = data.get("payCurrency", "USD")
+        if not pay_period or not adjusted_pay:
+            return None
+
+        interval = None
+        if pay_period == "ANNUAL":
+            interval = CompensationInterval.YEARLY
+        elif pay_period:
+            interval = CompensationInterval.get_interval(pay_period)
+        min_amount = int(adjusted_pay.get("p10") // 1)
+        max_amount = int(adjusted_pay.get("p90") // 1)
+        return Compensation(
+            interval=interval,
+            min_amount=min_amount,
+            max_amount=max_amount,
+            currency=currency,
+        )
+
+    @staticmethod
+    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+
+    @staticmethod
+    def parse_location(location_name: str) -> Location | None:
+        if not location_name or location_name == "Remote":
+            return
+        city, _, state = location_name.partition(", ")
+        return Location(city=city, state=state)
+
+    @staticmethod
+    def get_cursor_for_page(pagination_cursors, page_num):
+        for cursor_data in pagination_cursors:
+            if cursor_data["pageNumber"] == page_num:
+                return cursor_data["cursor"]
+
+    fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
+
+    headers = {
+        "authority": "www.glassdoor.com",
+        "accept": "*/*",
+        "accept-language": "en-US,en;q=0.9",
+        "apollographql-client-name": "job-search-next",
+        "apollographql-client-version": "4.65.5",
+        "content-type": "application/json",
+        "origin": "https://www.glassdoor.com",
+        "referer": "https://www.glassdoor.com/",
+        "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"',
+        "sec-fetch-dest": "empty",
+        "sec-fetch-mode": "cors",
+        "sec-fetch-site": "same-origin",
+        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+    }
+
+    query_template = """
         query JobSearchResultsQuery(
             $excludeJobListingIds: [Long!],
             $keyword: String,
@@ -448,69 +514,3 @@ class GlassdoorScraper(Scraper):
             __typename
         }
         """
-        }
-        if self.scraper_input.job_type:
-            payload["variables"]["filterParams"].append(
-                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
-            )
-        return json.dumps([payload])
-
-    @staticmethod
-    def parse_compensation(data: dict) -> Optional[Compensation]:
-        pay_period = data.get("payPeriod")
-        adjusted_pay = data.get("payPeriodAdjustedPay")
-        currency = data.get("payCurrency", "USD")
-        if not pay_period or not adjusted_pay:
-            return None
-
-        interval = None
-        if pay_period == "ANNUAL":
-            interval = CompensationInterval.YEARLY
-        elif pay_period:
-            interval = CompensationInterval.get_interval(pay_period)
-        min_amount = int(adjusted_pay.get("p10") // 1)
-        max_amount = int(adjusted_pay.get("p90") // 1)
-        return Compensation(
-            interval=interval,
-            min_amount=min_amount,
-            max_amount=max_amount,
-            currency=currency,
-        )
-
-    @staticmethod
-    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return [job_type]
-
-    @staticmethod
-    def parse_location(location_name: str) -> Location | None:
-        if not location_name or location_name == "Remote":
-            return
-        city, _, state = location_name.partition(", ")
-        return Location(city=city, state=state)
-
-    @staticmethod
-    def get_cursor_for_page(pagination_cursors, page_num):
-        for cursor_data in pagination_cursors:
-            if cursor_data["pageNumber"] == page_num:
-                return cursor_data["cursor"]
-
-    headers = {
-        "authority": "www.glassdoor.com",
-        "accept": "*/*",
-        "accept-language": "en-US,en;q=0.9",
-        "apollographql-client-name": "job-search-next",
-        "apollographql-client-version": "4.65.5",
-        "content-type": "application/json",
-        "gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok",
-        "origin": "https://www.glassdoor.com",
-        "referer": "https://www.glassdoor.com/",
-        "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '"macOS"',
-        "sec-fetch-dest": "empty",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-site": "same-origin",
-        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-    }

View File

@@ -15,7 +15,6 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

-from ..exceptions import IndeedException
 from ..utils import (
     count_urgent_words,
     extract_emails_from_text,
@@ -59,14 +58,11 @@ class IndeedScraper(Scraper):
         self.scraper_input = scraper_input
         job_list = self._scrape_page()
         pages_processed = 1
-        urlCount = 0

         while len(self.seen_urls) < scraper_input.results_wanted:
             pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
             new_jobs = False
-            print(f'Indeed search page: {urlCount}')
-            urlCount += 1
-            with ThreadPoolExecutor(max_workers=10) as executor:
+            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
                 futures: list[Future] = [
                     executor.submit(self._scrape_page, page + pages_processed)
                     for page in range(pages_to_process)
@@ -90,6 +86,7 @@ class IndeedScraper(Scraper):
         return JobResponse(jobs=job_list)

     def _scrape_page(self, page: int=0) -> list[JobPost]:
+        logger.info(f'Indeed search page: {page + 1}')
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param page:
@@ -143,7 +140,6 @@ class IndeedScraper(Scraper):
             job_results: list[Future] = [
                 executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
             ]
-
             job_list = [result.result() for result in job_results if result.result()]
         return job_list

View File

@@ -9,8 +9,6 @@ import random
 from typing import Optional
 from datetime import datetime

-import requests
-from requests.exceptions import ProxyError
 from threading import Lock
 from bs4.element import Tag
 from bs4 import BeautifulSoup
@@ -41,15 +39,16 @@ from ..utils import (

 class LinkedInScraper(Scraper):
     base_url = "https://www.linkedin.com"
     delay = 3
+    band_delay = 4
+    jobs_per_page = 25

     def __init__(self, proxy: Optional[str] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
+        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
         self.scraper_input = None
-        site = Site(Site.LINKEDIN)
         self.country = "worldwide"
-        super().__init__(site, proxy=proxy)

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -68,10 +67,8 @@
             else None
         )
         continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
-        urlCount = 0
         while continue_search():
-            print(f'LinkedIn searches: {urlCount}')
-            urlCount += 1
+            logger.info(f'LinkedIn search page: {page // 25 + 1}')
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
@@ -85,8 +82,9 @@
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
                 "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
-                "f_TPR": f"r{seconds_old}",
             }
+            if seconds_old is not None:
+                params["f_TPR"] = f"r{seconds_old}"

             params = {k: v for k, v in params.items() if v is not None}
             try:
@@ -103,13 +101,13 @@
                         logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
                     else:
                         logger.error(f'LinkedIn response status code {response.status_code}')
-                    return JobResponse(job_list=job_list)
+                    return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
                     logger.error(f'LinkedIn: Bad proxy')
                 else:
                     logger.error(f'LinkedIn: {str(e)}')
-                return JobResponse(job_list=job_list)
+                return JobResponse(jobs=job_list)

             soup = BeautifulSoup(response.text, "html.parser")
             job_cards = soup.find_all("div", class_="base-search-card")
@@ -138,8 +136,8 @@
                     raise LinkedInException(str(e))

             if continue_search():
-                time.sleep(random.uniform(self.delay, self.delay + 7))
-                page += 25
+                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+                page += self.jobs_per_page

         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

View File

@@ -13,9 +13,8 @@ text_maker = html2text.HTML2Text()
 logger = logging.getLogger("JobSpy")
 logger.propagate = False
 if not logger.handlers:
-    logger.setLevel(logging.ERROR)
+    logger.setLevel(logging.INFO)
     console_handler = logging.StreamHandler()
-    console_handler.setLevel(logging.ERROR)
     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
     console_handler.setFormatter(formatter)
     logger.addHandler(console_handler)