From a5916edcddf833419898dcdc30180e91fcdad947 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 3 Jan 2024 12:04:32 -0600 Subject: [PATCH] fix(glassdoor): add retry adapter (#77) --- src/jobspy/scrapers/glassdoor/__init__.py | 8 ++++---- src/jobspy/scrapers/utils.py | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 57fff5f..706b3e7 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -26,7 +26,7 @@ class GlassdoorScraper(Scraper): """ Initializes GlassdoorScraper with the Glassdoor job search url """ - site = Site(Site.ZIP_RECRUITER) + site = Site(Site.GLASSDOOR) super().__init__(site, proxy=proxy) self.url = None @@ -49,7 +49,7 @@ class GlassdoorScraper(Scraper): payload = self.add_payload( scraper_input, location_id, location_type, page_num, cursor ) - session = create_session(self.proxy, is_tls=False) + session = create_session(self.proxy, is_tls=False, has_retry=True) response = session.post( f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload ) @@ -171,7 +171,7 @@ class GlassdoorScraper(Scraper): if not location or is_remote: return "11047", "STATE" # remote options url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" - session = create_session(self.proxy) + session = create_session(self.proxy, has_retry=True) response = session.get(url) if response.status_code != 200: raise GlassdoorException( @@ -194,7 +194,7 @@ class GlassdoorScraper(Scraper): location_type: str, page_num: int, cursor: str | None = None, - ) -> dict[str, str | Any]: + ) -> str: payload = { "operationName": "JobSearchResultsQuery", "variables": { diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 6ca635b..9b38c0e 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,8 +1,10 @@ import re import numpy as np -import requests import tls_client +import requests +from requests.adapters import HTTPAdapter, Retry + from ..jobs import JobType @@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None: return email_regex.findall(text) -def create_session(proxy: dict | None = None, is_tls: bool = True): +def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False): """ - Creates a tls client session + Creates a requests session with optional tls, proxy, and retry settings. - :return: A session object with or without proxies. + :return: A session object """ if is_tls: session = tls_client.Session( @@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True): session.allow_redirects = True if proxy: session.proxies.update(proxy) + if has_retry: + retries = Retry(total=3, + connect=3, + status=3, + status_forcelist=[500, 502, 503, 504, 429], + backoff_factor=1) + adapter = HTTPAdapter(max_retries=retries) + + session.mount('http://', adapter) + session.mount('https://', adapter) return session