diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 0e5deb1..f6b8eb9 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -31,6 +31,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxies: list[str] | str | None = None,
+    ca_cert: str | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
@@ -97,7 +98,7 @@ def scrape_jobs(
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxies=proxies)
+        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 3f9ab51..8ca0539 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -42,9 +42,10 @@ class ScraperInput(BaseModel):
 
 
 class Scraper(ABC):
-    def __init__(self, site: Site, proxies: list[str] | None = None):
-        self.proxies = proxies
+    def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
         self.site = site
+        self.proxies = proxies
+        self.ca_cert = ca_cert
 
     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index a133153..d8666da 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -34,12 +34,12 @@ from ...jobs import (
 
 
 class GlassdoorScraper(Scraper):
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxies=proxies)
+        super().__init__(site, proxies=proxies, ca_cert=ca_cert)
 
         self.base_url = None
         self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()
 
-        self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
+        self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token
 
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index f9d2bfc..b5ee1ef 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -32,13 +32,13 @@ from ...jobs import (
 
 
 class IndeedScraper(Scraper):
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """
         super().__init__(Site.INDEED, proxies=proxies)
 
-        self.session = create_session(proxies=self.proxies, is_tls=False)
+        self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False)
         self.scraper_input = None
         self.jobs_per_page = 100
         self.num_workers = 10
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 36ddb92..6931b09 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -44,13 +44,14 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25
 
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site.LINKEDIN, proxies=proxies)
+        super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
         self.session = create_session(
             proxies=self.proxies,
+            ca_cert=ca_cert,
             is_tls=False,
             has_retry=True,
             delay=5,
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 5ebf2b6..8ccd404 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -100,6 +100,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
 def create_session(
     *,
     proxies: dict | str | None = None,
+    ca_cert: str | None = None,
     is_tls: bool = True,
     has_retry: bool = False,
     delay: int = 1,
@@ -119,6 +120,9 @@ def create_session(
             clear_cookies=clear_cookies,
         )
 
+    if ca_cert:
+        session.verify = ca_cert
+
     return session
 
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 6a061e5..b5c0221 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -41,14 +41,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"
 
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
 
         self.scraper_input = None
-        self.session = create_session(proxies=proxies)
+        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
         self._get_cookies()
 
         self.delay = 5
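
Usage note: the new `ca_cert` argument threads a CA bundle path from `scrape_jobs` through each scraper into `create_session`, which assigns it to `session.verify`. A minimal sketch of how a caller might use it; only `proxies`, `ca_cert`, and `country_indeed` come from the signature in this diff, while `site_name`, `search_term`, and the proxy string format are assumed from the published jobspy API:

```python
from jobspy import scrape_jobs

# Point ca_cert at a PEM bundle so requests made behind a TLS-intercepting
# proxy can still verify certificates instead of disabling verification.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],                  # assumed keyword, not in this diff
    search_term="software engineer",                   # assumed keyword, not in this diff
    country_indeed="usa",
    proxies="user:pass@corp-proxy.example.com:8080",   # illustrative proxy
    ca_cert="/etc/ssl/certs/corp-ca-bundle.pem",       # new in this change
)
print(f"Found {len(jobs)} jobs")
```

The plumbing can also be exercised directly at the session level; the bundle path below is illustrative:

```python
from jobspy.scrapers.utils import create_session

session = create_session(proxies=None, ca_cert="/etc/ssl/certs/corp-ca-bundle.pem", is_tls=False)
assert session.verify == "/etc/ssl/certs/corp-ca-bundle.pem"  # set by the new ca_cert branch
```

For the requests-based sessions (`is_tls=False`), assigning a file path to `verify` is the standard way to trust a custom CA bundle; whether the `tls_client`-based session honors the same attribute is not shown in this diff.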