fix(glassdoor): add retry adapter (#77)

pull/79/head^2
Cullen Watson 2024-01-03 12:04:32 -06:00 committed by GitHub
parent 33d442bf1e
commit a5916edcdd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 8 deletions

View File

@ -26,7 +26,7 @@ class GlassdoorScraper(Scraper):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.ZIP_RECRUITER)
site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy)
self.url = None
@ -49,7 +49,7 @@ class GlassdoorScraper(Scraper):
payload = self.add_payload(
scraper_input, location_id, location_type, page_num, cursor
)
session = create_session(self.proxy, is_tls=False)
session = create_session(self.proxy, is_tls=False, has_retry=True)
response = session.post(
f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
)
@ -171,7 +171,7 @@ class GlassdoorScraper(Scraper):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy)
session = create_session(self.proxy, has_retry=True)
response = session.get(url)
if response.status_code != 200:
raise GlassdoorException(
@ -194,7 +194,7 @@ class GlassdoorScraper(Scraper):
location_type: str,
page_num: int,
cursor: str | None = None,
) -> dict[str, str | Any]:
) -> str:
payload = {
"operationName": "JobSearchResultsQuery",
"variables": {

View File

@ -1,8 +1,10 @@
import re
import numpy as np
import requests
import tls_client
import requests
from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType
@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True):
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False):
"""
Creates a tls client session
Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object with or without proxies.
:return: A session object
"""
if is_tls:
session = tls_client.Session(
@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
session.allow_redirects = True
if proxy:
session.proxies.update(proxy)
if has_retry:
retries = Retry(total=3,
connect=3,
status=3,
status_forcelist=[500, 502, 503, 504, 429],
backoff_factor=1)
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)
return session