Compare commits

...

5 Commits

Author           SHA1        Message                                    Date
Cullen Watson    a7ad616567  fix: linkedin no results (#80)             2024-01-10 14:01:10 -06:00
cullenwatson     53bc33a43a  chore: version                             2024-01-09 19:33:56 -06:00
Cullen Watson    22870438c7  linkedin fix delays (#79)                  2024-01-09 19:32:51 -06:00
Cullen Watson    aeb93b99f5  Update pyproject.toml                      2024-01-03 12:04:50 -06:00
Cullen Watson    a5916edcdd  fix(glassdoor): add retry adapter (#77)    2024-01-03 12:04:32 -06:00
4 changed files with 73 additions and 56 deletions

File: pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.30"
+version = "1.1.33"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

File: jobspy.scrapers.glassdoor

@@ -26,7 +26,7 @@ class GlassdoorScraper(Scraper):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
-        site = Site(Site.ZIP_RECRUITER)
+        site = Site(Site.GLASSDOOR)
         super().__init__(site, proxy=proxy)
 
         self.url = None
@@ -49,7 +49,7 @@ class GlassdoorScraper(Scraper):
             payload = self.add_payload(
                 scraper_input, location_id, location_type, page_num, cursor
             )
-            session = create_session(self.proxy, is_tls=False)
+            session = create_session(self.proxy, is_tls=False, has_retry=True)
             response = session.post(
                 f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
             )
@@ -171,7 +171,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy)
+        session = create_session(self.proxy, has_retry=True)
         response = session.get(url)
         if response.status_code != 200:
             raise GlassdoorException(
@@ -194,7 +194,7 @@ class GlassdoorScraper(Scraper):
         location_type: str,
         page_num: int,
         cursor: str | None = None,
-    ) -> dict[str, str | Any]:
+    ) -> str:
         payload = {
             "operationName": "JobSearchResultsQuery",
             "variables": {

File: jobspy.scrapers.linkedin

@@ -4,6 +4,7 @@ jobspy.scrapers.linkedin
 
 This module contains routines to scrape LinkedIn.
 """
+import random
 from typing import Optional
 from datetime import datetime
@@ -16,14 +17,14 @@ from threading import Lock
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
+from ..utils import create_session
 from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 
 
 class LinkedInScraper(Scraper):
-    MAX_RETRIES = 3
-    DELAY = 10
+    DELAY = 3
 
     def __init__(self, proxy: Optional[str] = None):
         """
@@ -57,6 +58,7 @@ class LinkedInScraper(Scraper):
             return mapping.get(job_type_enum, "")
 
         while len(job_list) < scraper_input.results_wanted and page < 1000:
+            session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
                 "location": scraper_input.location,
@@ -71,44 +73,30 @@ class LinkedInScraper(Scraper):
             }
 
             params = {k: v for k, v in params.items() if v is not None}
 
-            retries = 0
-            while retries < self.MAX_RETRIES:
-                try:
-                    response = requests.get(
-                        f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                        params=params,
-                        allow_redirects=True,
-                        proxies=self.proxy,
-                        timeout=10,
-                    )
-                    response.raise_for_status()
-                    break
-                except requests.HTTPError as e:
-                    if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code in (429, 502):
-                            time.sleep(self.DELAY)
-                            retries += 1
-                            continue
-                        else:
-                            raise LinkedInException(
-                                f"bad response status code: {e.response.status_code}"
-                            )
-                    else:
-                        raise
-                except ProxyError as e:
-                    raise LinkedInException("bad proxy")
-                except Exception as e:
-                    raise LinkedInException(str(e))
-            else:
-                # Raise an exception if the maximum number of retries is reached
-                raise LinkedInException(
-                    "Max retries reached, failed to get a valid response"
-                )
+            try:
+                response = session.get(
+                    f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                    params=params,
+                    allow_redirects=True,
+                    proxies=self.proxy,
+                    headers=self.headers(),
+                    timeout=10,
+                )
+                response.raise_for_status()
+            except requests.HTTPError as e:
+                raise LinkedInException(f"bad response status code: {e.response.status_code}")
+            except ProxyError as e:
+                raise LinkedInException("bad proxy")
+            except Exception as e:
+                raise LinkedInException(str(e))
 
             soup = BeautifulSoup(response.text, "html.parser")
+            job_cards = soup.find_all("div", class_="base-search-card")
+            if len(job_cards) == 0:
+                return JobResponse(jobs=job_list)
 
-            for job_card in soup.find_all("div", class_="base-search-card"):
+            for job_card in job_cards:
                 job_url = None
                 href_tag = job_card.find("a", class_="base-card__full-link")
                 if href_tag and "href" in href_tag.attrs:
@@ -130,6 +118,7 @@ class LinkedInScraper(Scraper):
                     raise LinkedInException("Exception occurred while processing jobs")
 
             page += 25
+            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
@@ -181,22 +170,22 @@ class LinkedInScraper(Scraper):
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
 
-        description, job_type = self.get_job_description(job_url)
-        # description, job_type = None, []
+        # removed to speed up scraping
+        # description, job_type = self.get_job_description(job_url)
 
         return JobPost(
             title=title,
-            description=description,
             company_name=company,
             company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
-            job_type=job_type,
             compensation=compensation,
             benefits=benefits,
-            emails=extract_emails_from_text(description) if description else None,
-            num_urgent_words=count_urgent_words(description) if description else None,
+            # job_type=job_type,
+            # description=description,
+            # emails=extract_emails_from_text(description) if description else None,
+            # num_urgent_words=count_urgent_words(description) if description else None,
         )
@@ -208,12 +197,10 @@ class LinkedInScraper(Scraper):
         :return: description or None
         """
         try:
-            response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
+            session = create_session(is_tls=False, has_retry=True)
+            response = session.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
         except requests.HTTPError as e:
-            if hasattr(e, "response") and e.response is not None:
-                if e.response.status_code in (429, 502):
-                    time.sleep(self.DELAY)
             return None, None
         except Exception as e:
             return None, None
@@ -287,3 +274,21 @@ class LinkedInScraper(Scraper):
         )
         return location
+
+    @staticmethod
+    def headers() -> dict:
+        return {
+            'authority': 'www.linkedin.com',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'accept-language': 'en-US,en;q=0.9',
+            'cache-control': 'max-age=0',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            # 'sec-ch-ua-mobile': '?0',
+            # 'sec-ch-ua-platform': '"macOS"',
+            # 'sec-fetch-dest': 'document',
+            # 'sec-fetch-mode': 'navigate',
+            # 'sec-fetch-site': 'none',
+            # 'sec-fetch-user': '?1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        }
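
The LinkedIn changes replace the hand-rolled retry loop with a retry-enabled session and move throttling to a jittered sleep between result pages (DELAY presumably drops from 10 to 3 because it now runs on every page, not only after errors). A minimal sketch of the jittered-pagination pattern, with the fetch/parse step elided:

import random
import time

DELAY = 3  # base delay between pages, per the new class constant

for offset in range(0, 1000, 25):  # LinkedIn paginates in steps of 25 results
    # ... fetch and parse one page of job cards here ...
    # sleep 3-5 seconds so the request cadence is not perfectly uniform
    time.sleep(random.uniform(DELAY, DELAY + 2))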

File: jobspy.scrapers.utils

@@ -1,8 +1,10 @@
 import re
 import numpy as np
 
-import requests
 import tls_client
+import requests
+from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import JobType
@@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None:
     return email_regex.findall(text)
 
 
-def create_session(proxy: dict | None = None, is_tls: bool = True):
+def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
     """
-    Creates a tls client session
-    :return: A session object with or without proxies.
+    Creates a requests session with optional tls, proxy, and retry settings.
+    :return: A session object
     """
     if is_tls:
         session = tls_client.Session(
@@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
         session.allow_redirects = True
     if proxy:
         session.proxies.update(proxy)
+    if has_retry:
+        retries = Retry(total=3,
+                        connect=3,
+                        status=3,
+                        status_forcelist=[500, 502, 503, 504, 429],
+                        backoff_factor=delay)
+        adapter = HTTPAdapter(max_retries=retries)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
 
     return session
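
For reference, a usage sketch of the updated helper based only on the signature above; the proxy dict shape follows requests' conventions, and the top-level import path is an assumption from the package layout:

from jobspy.scrapers.utils import create_session  # import path assumed

proxy = {"http": "http://user:pass@host:port", "https": "http://user:pass@host:port"}

# requests.Session with retries on 429/5xx and a 5-second backoff factor,
# as the LinkedIn scraper calls it above
session = create_session(proxy, is_tls=False, has_retry=True, delay=5)

# default: tls_client session (browser-like TLS fingerprint), no retry adapter
tls_session = create_session(proxy)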