log search updates, fix glassdoor (#120)

pull/122/head
troy-conte 2024-03-04 17:39:38 -05:00 committed by GitHub
parent f8a4eccc6b
commit db01bc6bbb
7 changed files with 133 additions and 124 deletions
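Reviewer note, not part of the diff: the user-visible effect of this change set is per-page INFO logging in every scraper plus a working Glassdoor backend. A minimal usage sketch follows, assuming the package's public scrape_jobs entry point; the search term and location are placeholders, and the DataFrame return is the library's documented behavior.

# Minimal sketch (assumed entry point: jobspy.scrape_jobs); values are placeholders.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["glassdoor"],          # exercises the fixed Glassdoor scraper
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=20,
)
# With the logger now defaulting to INFO, each page fetch is reported, e.g.:
#   ... - JobSpy - INFO - Glassdoor search page: 1
print(jobs.head())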


@@ -167,8 +167,4 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
 - Waiting some time between scrapes (site-dependent).
 - Trying a VPN or proxy to change your IP address.
 ---


@@ -145,7 +145,7 @@ class Country(Enum):
         else:
             raise Exception(f"Glassdoor is not available for {self.name}")

-    def get_url(self):
+    def get_glassdoor_url(self):
         return f"https://{self.glassdoor_domain_value}/"

     @classmethod
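Aside on the rename above: a site-specific accessor name avoids ambiguity once several per-site URL helpers live on the same enum. A stripped-down illustration, using a mock enum rather than JobSpy's real Country class (which carries more data per member):

from enum import Enum

# Illustrative stand-in only; not the library's actual Country definition.
class Country(Enum):
    USA = ("usa", "www.glassdoor.com")

    @property
    def glassdoor_domain_value(self) -> str:
        return self.value[1]

    def get_glassdoor_url(self) -> str:  # renamed from get_url in this commit
        return f"https://{self.glassdoor_domain_value}/"

print(Country.USA.get_glassdoor_url())  # https://www.glassdoor.com/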


@@ -5,6 +5,8 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
 import json
+import re
 import requests
 from typing import Optional
 from datetime import datetime, timedelta
@@ -42,6 +44,7 @@ class GlassdoorScraper(Scraper):
         self.session = None
         self.scraper_input = None
         self.jobs_per_page = 30
+        self.max_pages = 30
         self.seen_urls = set()

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@@ -52,39 +55,40 @@ class GlassdoorScraper(Scraper):
         """
         self.scraper_input = scraper_input
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
-        self.base_url = self.scraper_input.country.get_url()
+        self.base_url = self.scraper_input.country.get_glassdoor_url()
+        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        token = self._get_csrf_token()
+        self.headers['gd-csrf-token'] = token if token else self.fallback_token

         location_id, location_type = self._get_location(
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
+            logger.error('Glassdoor: location not parsed')
             return JobResponse(jobs=[])
         all_jobs: list[JobPost] = []
         cursor = None
-        max_pages = 30
-        self.session = create_session(self.proxy, is_tls=False, has_retry=True)
-        self.session.get(self.base_url)
-        try:
-            for page in range(
-                1 + (scraper_input.offset // self.jobs_per_page),
-                min(
-                    (scraper_input.results_wanted // self.jobs_per_page) + 2,
-                    max_pages + 1,
-                ),
-            ):
-                try:
-                    jobs, cursor = self._fetch_jobs_page(
-                        scraper_input, location_id, location_type, page, cursor
-                    )
-                    all_jobs.extend(jobs)
-                    if len(all_jobs) >= scraper_input.results_wanted:
-                        all_jobs = all_jobs[: scraper_input.results_wanted]
-                        break
-                except Exception as e:
-                    raise GlassdoorException(str(e))
-        except Exception as e:
-            raise GlassdoorException(str(e))
+        for page in range(
+            1 + (scraper_input.offset // self.jobs_per_page),
+            min(
+                (scraper_input.results_wanted // self.jobs_per_page) + 2,
+                self.max_pages + 1,
+            ),
+        ):
+            logger.info(f'Glassdoor search page: {page}')
+            try:
+                jobs, cursor = self._fetch_jobs_page(
+                    scraper_input, location_id, location_type, page, cursor
+                )
+                all_jobs.extend(jobs)
+                if not jobs or len(all_jobs) >= scraper_input.results_wanted:
+                    all_jobs = all_jobs[: scraper_input.results_wanted]
+                    break
+            except Exception as e:
+                logger.error(f'Glassdoor: {str(e)}')
+                break

         return JobResponse(jobs=all_jobs)

     def _fetch_jobs_page(
@@ -98,27 +102,26 @@ class GlassdoorScraper(Scraper):
         """
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
         """
+        jobs = []
         self.scraper_input = scraper_input
         try:
             payload = self._add_payload(
                 location_id, location_type, page_num, cursor
             )
             response = self.session.post(
-                f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload
+                f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
             )
             if response.status_code != 200:
-                raise GlassdoorException(
-                    f"bad response status code: {response.status_code}"
-                )
+                raise GlassdoorException(f"bad response status code: {response.status_code}")
             res_json = response.json()[0]
             if "errors" in res_json:
                 raise ValueError("Error encountered in API response")
-        except Exception as e:
-            raise GlassdoorException(str(e))
+        except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
+            logger.error(f'Glassdoor: {str(e)}')
+            return jobs, None

         jobs_data = res_json["data"]["jobListings"]["jobListings"]
-        jobs = []
         with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
             future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
             for future in as_completed(future_to_job_data):
@@ -133,6 +136,18 @@ class GlassdoorScraper(Scraper):
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )

+    def _get_csrf_token(self):
+        """
+        Fetches csrf token needed for API by visiting a generic page
+        """
+        res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
+        pattern = r'"token":\s*"([^"]+)"'
+        matches = re.findall(pattern, res.text)
+        token = None
+        if matches:
+            token = matches[0]
+        return token
+
     def _process_job(self, job_data):
         """
         Processes a single job and fetches its description.
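To make the new _get_csrf_token helper concrete: the regex pulls the first "token": "..." pair out of the JSON blobs embedded in a rendered Glassdoor page. A standalone sketch with a fabricated HTML fragment:

import re

# Fabricated fragment standing in for the HTML of a Glassdoor search page.
html = '<script>window.appCache = {"token": "abc123csrf", "other": 1};</script>'

pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, html)
token = matches[0] if matches else None
print(token)  # abc123csrf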
@@ -217,7 +232,7 @@ class GlassdoorScraper(Scraper):
             return "11047", "STATE"  # remote options
         url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
         session = create_session(self.proxy, has_retry=True)
-        res = session.get(url)
+        res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:
                 logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
@@ -266,7 +281,74 @@ class GlassdoorScraper(Scraper):
                 "fromage": fromage,
                 "sort": "date"
             },
-            "query": """
+            "query": self.query_template
+        }
+        if self.scraper_input.job_type:
+            payload["variables"]["filterParams"].append(
+                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
+            )
+        return json.dumps([payload])
+
+    @staticmethod
+    def parse_compensation(data: dict) -> Optional[Compensation]:
+        pay_period = data.get("payPeriod")
+        adjusted_pay = data.get("payPeriodAdjustedPay")
+        currency = data.get("payCurrency", "USD")
+        if not pay_period or not adjusted_pay:
+            return None
+        interval = None
+        if pay_period == "ANNUAL":
+            interval = CompensationInterval.YEARLY
+        elif pay_period:
+            interval = CompensationInterval.get_interval(pay_period)
+        min_amount = int(adjusted_pay.get("p10") // 1)
+        max_amount = int(adjusted_pay.get("p90") // 1)
+        return Compensation(
+            interval=interval,
+            min_amount=min_amount,
+            max_amount=max_amount,
+            currency=currency,
+        )
+
+    @staticmethod
+    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+
+    @staticmethod
+    def parse_location(location_name: str) -> Location | None:
+        if not location_name or location_name == "Remote":
+            return
+        city, _, state = location_name.partition(", ")
+        return Location(city=city, state=state)
+
+    @staticmethod
+    def get_cursor_for_page(pagination_cursors, page_num):
+        for cursor_data in pagination_cursors:
+            if cursor_data["pageNumber"] == page_num:
+                return cursor_data["cursor"]
+
+    fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
+
+    headers = {
+        "authority": "www.glassdoor.com",
+        "accept": "*/*",
+        "accept-language": "en-US,en;q=0.9",
+        "apollographql-client-name": "job-search-next",
+        "apollographql-client-version": "4.65.5",
+        "content-type": "application/json",
+        "origin": "https://www.glassdoor.com",
+        "referer": "https://www.glassdoor.com/",
+        "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"',
+        "sec-fetch-dest": "empty",
+        "sec-fetch-mode": "cors",
+        "sec-fetch-site": "same-origin",
+        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+    }
+
+    query_template = """
         query JobSearchResultsQuery(
             $excludeJobListingIds: [Long!],
             $keyword: String,
@@ -431,70 +513,4 @@ class GlassdoorScraper(Scraper):
             }
             __typename
         }
         """
-        }
-        if self.scraper_input.job_type:
-            payload["variables"]["filterParams"].append(
-                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
-            )
-        return json.dumps([payload])
-
-    @staticmethod
-    def parse_compensation(data: dict) -> Optional[Compensation]:
-        pay_period = data.get("payPeriod")
-        adjusted_pay = data.get("payPeriodAdjustedPay")
-        currency = data.get("payCurrency", "USD")
-        if not pay_period or not adjusted_pay:
-            return None
-        interval = None
-        if pay_period == "ANNUAL":
-            interval = CompensationInterval.YEARLY
-        elif pay_period:
-            interval = CompensationInterval.get_interval(pay_period)
-        min_amount = int(adjusted_pay.get("p10") // 1)
-        max_amount = int(adjusted_pay.get("p90") // 1)
-        return Compensation(
-            interval=interval,
-            min_amount=min_amount,
-            max_amount=max_amount,
-            currency=currency,
-        )
-
-    @staticmethod
-    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return [job_type]
-
-    @staticmethod
-    def parse_location(location_name: str) -> Location | None:
-        if not location_name or location_name == "Remote":
-            return
-        city, _, state = location_name.partition(", ")
-        return Location(city=city, state=state)
-
-    @staticmethod
-    def get_cursor_for_page(pagination_cursors, page_num):
-        for cursor_data in pagination_cursors:
-            if cursor_data["pageNumber"] == page_num:
-                return cursor_data["cursor"]
-
-    headers = {
-        "authority": "www.glassdoor.com",
-        "accept": "*/*",
-        "accept-language": "en-US,en;q=0.9",
-        "apollographql-client-name": "job-search-next",
-        "apollographql-client-version": "4.65.5",
-        "content-type": "application/json",
-        "gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok",
-        "origin": "https://www.glassdoor.com",
-        "referer": "https://www.glassdoor.com/",
-        "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '"macOS"',
-        "sec-fetch-dest": "empty",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-site": "same-origin",
-        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-    }
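For reference, the relocated parse_compensation logic maps Glassdoor's pay fields to an integer salary range. A worked example with made-up numbers, reproducing only the arithmetic rather than JobSpy's Compensation model:

# Made-up Glassdoor payload fields; only the floor-to-int arithmetic is reproduced.
data = {
    "payPeriod": "ANNUAL",
    "payCurrency": "USD",
    "payPeriodAdjustedPay": {"p10": 91250.75, "p90": 134980.25},
}

adjusted_pay = data["payPeriodAdjustedPay"]
min_amount = int(adjusted_pay.get("p10") // 1)  # 91250
max_amount = int(adjusted_pay.get("p90") // 1)  # 134980
print(min_amount, max_amount, data["payCurrency"])  # 91250 134980 USD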


@@ -15,7 +15,6 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
-from ..exceptions import IndeedException
 from ..utils import (
     count_urgent_words,
     extract_emails_from_text,
@@ -63,8 +62,7 @@ class IndeedScraper(Scraper):
         while len(self.seen_urls) < scraper_input.results_wanted:
             pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
             new_jobs = False
-
-            with ThreadPoolExecutor(max_workers=10) as executor:
+            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
                 futures: list[Future] = [
                     executor.submit(self._scrape_page, page + pages_processed)
                     for page in range(pages_to_process)
@@ -93,10 +91,11 @@ class IndeedScraper(Scraper):
         :param page:
         :return: jobs found on page, total number of jobs found for search
         """
+        logger.info(f'Indeed search page: {page + 1}')
         job_list = []
         domain = self.scraper_input.country.indeed_domain_value
         self.base_url = f"https://{domain}.indeed.com"

         try:
             session = create_session(self.proxy)
             response = session.get(
@@ -141,7 +140,6 @@ class IndeedScraper(Scraper):
             job_results: list[Future] = [
                 executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
             ]
-
             job_list = [result.result() for result in job_results if result.result()]
         return job_list


@@ -9,8 +9,6 @@ import random
 from typing import Optional
 from datetime import datetime
-import requests
-from requests.exceptions import ProxyError
 from threading import Lock
 from bs4.element import Tag
 from bs4 import BeautifulSoup
@@ -41,15 +39,16 @@ from ..utils import (
 class LinkedInScraper(Scraper):
     base_url = "https://www.linkedin.com"
     delay = 3
+    band_delay = 4
+    jobs_per_page = 25

     def __init__(self, proxy: Optional[str] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
+        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
         self.scraper_input = None
-        site = Site(Site.LINKEDIN)
         self.country = "worldwide"
-        super().__init__(site, proxy=proxy)

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -68,8 +67,8 @@ class LinkedInScraper(Scraper):
             else None
         )
         continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
         while continue_search():
+            logger.info(f'LinkedIn search page: {page // 25 + 1}')
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
@@ -83,8 +82,9 @@ class LinkedInScraper(Scraper):
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
                 "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
-                "f_TPR": f"r{seconds_old}",
             }
+            if seconds_old is not None:
+                params["f_TPR"] = f"r{seconds_old}"
             params = {k: v for k, v in params.items() if v is not None}

             try:
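The point of the f_TPR change above: when the key was built unconditionally, a search with no age filter sent the literal string "rNone" (which the None-filter comprehension does not remove). Guarding on seconds_old keeps the param out entirely. A small sketch with a hypothetical helper, not JobSpy's actual code:

def build_params(search_term: str, seconds_old: int | None) -> dict:
    # Mirrors the guarded construction from the hunk above.
    params = {"keywords": search_term, "f_AL": None}
    if seconds_old is not None:
        params["f_TPR"] = f"r{seconds_old}"
    return {k: v for k, v in params.items() if v is not None}

print(build_params("python", 86400))  # {'keywords': 'python', 'f_TPR': 'r86400'}
print(build_params("python", None))   # {'keywords': 'python'} -- no 'rNone' leaking through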
@@ -101,13 +101,13 @@ class LinkedInScraper(Scraper):
                         logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
                     else:
                         logger.error(f'LinkedIn response status code {response.status_code}')
-                    return JobResponse(job_list=job_list)
+                    return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
                     logger.error(f'LinkedIn: Bad proxy')
                 else:
                     logger.error(f'LinkedIn: {str(e)}')
-                return JobResponse(job_list=job_list)
+                return JobResponse(jobs=job_list)

             soup = BeautifulSoup(response.text, "html.parser")
             job_cards = soup.find_all("div", class_="base-search-card")
@@ -136,8 +136,8 @@ class LinkedInScraper(Scraper):
                     raise LinkedInException(str(e))

             if continue_search():
-                time.sleep(random.uniform(self.delay, self.delay + 2))
-                page += 25
+                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+                page += self.jobs_per_page

         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)


@@ -13,9 +13,8 @@ text_maker = html2text.HTML2Text()
 logger = logging.getLogger("JobSpy")
 logger.propagate = False
 if not logger.handlers:
-    logger.setLevel(logging.ERROR)
+    logger.setLevel(logging.INFO)
     console_handler = logging.StreamHandler()
-    console_handler.setLevel(logging.ERROR)
     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
     console_handler.setFormatter(formatter)
     logger.addHandler(console_handler)
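Since the library logger now defaults to INFO, the new per-page search messages print by default. Callers who preferred the old quiet behavior can dial the named logger back down with the standard logging API:

import logging

# Restore the pre-change quietness for the "JobSpy" logger if INFO output is unwanted.
logging.getLogger("JobSpy").setLevel(logging.WARNING)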


@@ -63,7 +63,7 @@ class ZipRecruiterScraper(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
+            logger.info(f'ZipRecruiter search page: {page}')
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )