mirror of https://github.com/Bunsly/JobSpy
issue#270: glassdoor 403 response by rotating user-agent and updating headers
parent 94d413bad1
commit 9b7e12d08c
@@ -45,6 +45,7 @@ def scrape_jobs(
     hours_old: int = None,
     enforce_annual_salary: bool = False,
     verbose: int = 0,
+    user_agent: str = None,
     **kwargs,
 ) -> pd.DataFrame:
     """
@@ -98,7 +99,7 @@ def scrape_jobs(
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
+        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
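Taken together, the two hunks above thread a caller-chosen agent string from the public entry point down to each scraper. A minimal usage sketch under that assumption follows: user_agent is the keyword added here, the other keyword arguments illustrate the existing scrape_jobs signature visible in the diff context, and the agent strings are placeholders, not recommendations.

import random

from jobspy import scrape_jobs

# Placeholder browser agent strings; rotate whichever current ones you trust.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
]

jobs = scrape_jobs(
    site_name=["glassdoor"],
    search_term="software engineer",
    location="San Francisco, CA",
    results_wanted=20,
    user_agent=random.choice(USER_AGENTS),  # keyword introduced by this commit
)
print(jobs.head())

Rotation itself stays on the caller's side; the commit only plumbs the chosen string through scrape_site into each scraper constructor.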
@@ -25,7 +25,7 @@ class BaytScraper(Scraper):
     band_delay = 3
 
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None
@@ -34,13 +34,13 @@ log = create_logger("Glassdoor")
 
 class Glassdoor(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxies=proxies, ca_cert=ca_cert)
+        super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
 
         self.base_url = None
         self.country = None
@@ -65,6 +65,8 @@ class Glassdoor(Scraper):
         )
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
+        if self.user_agent:
+            headers["user-agent"] = self.user_agent
         self.session.headers.update(headers)
 
         location_id, location_type = self._get_location(
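The guard above only overrides the session's user-agent when the caller supplied one; otherwise the headers built earlier in the method are sent unchanged. A self-contained sketch of that behaviour with a plain requests.Session (the default header values below are stand-ins, not the actual Glassdoor headers):

import requests

def configure_session(user_agent: str | None = None) -> requests.Session:
    # Stand-in for the header dict the scraper assembles before this point.
    headers = {"user-agent": "library-default/1.0", "gd-csrf-token": "fallback"}
    if user_agent:
        # Same pattern as the diff: a caller-supplied agent wins over the default.
        headers["user-agent"] = user_agent
    session = requests.Session()
    session.headers.update(headers)
    return session

print(configure_session().headers["user-agent"])          # -> library-default/1.0
print(configure_session("Mozilla/5.0 (X11; Linux x86_64)").headers["user-agent"])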
@@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info
 
 class Google(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes Google Scraper with the Goodle jobs search url
@@ -28,7 +28,7 @@ log = create_logger("Indeed")
 
 class Indeed(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes IndeedScraper with the Indeed API url
@@ -50,7 +50,7 @@ class LinkedIn(Scraper):
     jobs_per_page = 25
 
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
@@ -320,11 +320,12 @@ class ScraperInput(BaseModel):
 
 class Scraper(ABC):
     def __init__(
-        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
+        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         self.site = site
         self.proxies = proxies
         self.ca_cert = ca_cert
+        self.user_agent = user_agent
 
     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
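This hunk is what ties the change together: the abstract base stores the agent, and a concrete scraper sees it only if it forwards the keyword to super().__init__ (Glassdoor forwards it in the hunk further up; Bayt's super() call is unchanged, so its self.user_agent stays None). A stripped-down sketch with simplified types, not the real jobspy classes:

from abc import ABC, abstractmethod

class Scraper(ABC):
    # Mirrors the base-class change: the agent is stored once and read by subclasses.
    def __init__(self, site: str, proxies=None, ca_cert=None, user_agent: str | None = None):
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert
        self.user_agent = user_agent

    @abstractmethod
    def scrape(self, scraper_input): ...

class ForwardingScraper(Scraper):
    # Hypothetical subclass that forwards the keyword, as Glassdoor does in this commit.
    def __init__(self, proxies=None, ca_cert=None, user_agent: str | None = None):
        super().__init__("demo", proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)

    def scrape(self, scraper_input):
        return self.user_agent or "library default"

print(ForwardingScraper(user_agent="Mozilla/5.0 ...").scrape(None))  # caller's agent
print(ForwardingScraper().scrape(None))                              # "library default"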
@@ -44,7 +44,7 @@ class Naukri(Scraper):
    jobs_per_page = 20
 
    def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes NaukriScraper with the Naukri API URL
@@ -38,7 +38,7 @@ class ZipRecruiter(Scraper):
     api_url = "https://api.ziprecruiter.com"
 
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url