From 9b7e12d08ce7b552342f444b923c9b3c82ffffbd Mon Sep 17 00:00:00 2001 From: letrongtai902 Date: Sun, 4 May 2025 01:04:56 +0700 Subject: [PATCH] issue#270: fix glassdoor 403 response by rotating user-agent and updating headers --- jobspy/__init__.py | 3 ++- jobspy/bayt/__init__.py | 2 +- jobspy/glassdoor/__init__.py | 6 ++++-- jobspy/google/__init__.py | 2 +- jobspy/indeed/__init__.py | 2 +- jobspy/linkedin/__init__.py | 2 +- jobspy/model.py | 3 ++- jobspy/naukri/__init__.py | 2 +- jobspy/ziprecruiter/__init__.py | 2 +- 9 files changed, 14 insertions(+), 10 deletions(-) diff --git a/jobspy/__init__.py b/jobspy/__init__.py index 7ec88e5..b7a8c6c 100644 --- a/jobspy/__init__.py +++ b/jobspy/__init__.py @@ -45,6 +45,7 @@ def scrape_jobs( hours_old: int = None, enforce_annual_salary: bool = False, verbose: int = 0, + user_agent: str = None, **kwargs, ) -> pd.DataFrame: """ @@ -98,7 +99,7 @@ def scrape_jobs( def scrape_site(site: Site) -> Tuple[str, JobResponse]: scraper_class = SCRAPER_MAPPING[site] - scraper = scraper_class(proxies=proxies, ca_cert=ca_cert) + scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent) scraped_data: JobResponse = scraper.scrape(scraper_input) cap_name = site.value.capitalize() site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name diff --git a/jobspy/bayt/__init__.py b/jobspy/bayt/__init__.py index 0fd29e9..14d14c0 100644 --- a/jobspy/bayt/__init__.py +++ b/jobspy/bayt/__init__.py @@ -25,7 +25,7 @@ class BaytScraper(Scraper): band_delay = 3 def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert) self.scraper_input = None diff --git a/jobspy/glassdoor/__init__.py b/jobspy/glassdoor/__init__.py index 225d7fd..8de7915 100644 --- a/jobspy/glassdoor/__init__.py +++ b/jobspy/glassdoor/__init__.py @@ -34,13 
+34,13 @@ log = create_logger("Glassdoor") class Glassdoor(Scraper): def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes GlassdoorScraper with the Glassdoor job search url """ site = Site(Site.GLASSDOOR) - super().__init__(site, proxies=proxies, ca_cert=ca_cert) + super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent) self.base_url = None self.country = None @@ -65,6 +65,8 @@ class Glassdoor(Scraper): ) token = self._get_csrf_token() headers["gd-csrf-token"] = token if token else fallback_token + if self.user_agent: + headers["user-agent"] = self.user_agent self.session.headers.update(headers) location_id, location_type = self._get_location( diff --git a/jobspy/google/__init__.py b/jobspy/google/__init__.py index e77903c..acc90aa 100644 --- a/jobspy/google/__init__.py +++ b/jobspy/google/__init__.py @@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info class Google(Scraper): def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes Google Scraper with the Goodle jobs search url diff --git a/jobspy/indeed/__init__.py b/jobspy/indeed/__init__.py index adbc9e9..667accb 100644 --- a/jobspy/indeed/__init__.py +++ b/jobspy/indeed/__init__.py @@ -28,7 +28,7 @@ log = create_logger("Indeed") class Indeed(Scraper): def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes IndeedScraper with the Indeed API url diff --git a/jobspy/linkedin/__init__.py b/jobspy/linkedin/__init__.py index c8bca93..a164732 100644 --- a/jobspy/linkedin/__init__.py 
+++ b/jobspy/linkedin/__init__.py @@ -50,7 +50,7 @@ class LinkedIn(Scraper): jobs_per_page = 25 def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes LinkedInScraper with the LinkedIn job search url diff --git a/jobspy/model.py b/jobspy/model.py index f9155b1..c5372ae 100644 --- a/jobspy/model.py +++ b/jobspy/model.py @@ -320,11 +320,12 @@ class ScraperInput(BaseModel): class Scraper(ABC): def __init__( - self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None + self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None ): self.site = site self.proxies = proxies self.ca_cert = ca_cert + self.user_agent = user_agent @abstractmethod def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/jobspy/naukri/__init__.py b/jobspy/naukri/__init__.py index 0a97bf0..d456c85 100644 --- a/jobspy/naukri/__init__.py +++ b/jobspy/naukri/__init__.py @@ -44,7 +44,7 @@ class Naukri(Scraper): jobs_per_page = 20 def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes NaukriScraper with the Naukri API URL diff --git a/jobspy/ziprecruiter/__init__.py b/jobspy/ziprecruiter/__init__.py index c91ba57..c8bb54e 100644 --- a/jobspy/ziprecruiter/__init__.py +++ b/jobspy/ziprecruiter/__init__.py @@ -38,7 +38,7 @@ class ZipRecruiter(Scraper): api_url = "https://api.ziprecruiter.com" def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None ): """ Initializes ZipRecruiterScraper with the ZipRecruiter job search url