FEATURE: Add the "ca_cert" setting, which supplies a Certificate Authority (CA) certificate so that proxies requiring one can be used. (#204)

pull/209/head
Marcel Gozalbo Baró 2024-10-09 00:46:46 +02:00 committed by GitHub
parent 0cc34287f7
commit 6bc191d5c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 19 additions and 12 deletions

View File

@ -31,6 +31,7 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxies: list[str] | str | None = None, proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown", description_format: str = "markdown",
linkedin_fetch_description: bool | None = False, linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None, linkedin_company_ids: list[int] | None = None,
@ -97,7 +98,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies) scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name

View File

@ -42,9 +42,10 @@ class ScraperInput(BaseModel):
class Scraper(ABC): class Scraper(ABC):
def __init__(self, site: Site, proxies: list[str] | None = None): def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
self.proxies = proxies
self.site = site self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod @abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@ -34,12 +34,12 @@ from ...jobs import (
class GlassdoorScraper(Scraper): class GlassdoorScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes GlassdoorScraper with the Glassdoor job search url Initializes GlassdoorScraper with the Glassdoor job search url
""" """
site = Site(Site.GLASSDOOR) site = Site(Site.GLASSDOOR)
super().__init__(site, proxies=proxies) super().__init__(site, proxies=proxies, ca_cert=ca_cert)
self.base_url = None self.base_url = None
self.country = None self.country = None
@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url() self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True) self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True)
token = self._get_csrf_token() token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token self.headers["gd-csrf-token"] = token if token else self.fallback_token

View File

@ -32,13 +32,13 @@ from ...jobs import (
class IndeedScraper(Scraper): class IndeedScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes IndeedScraper with the Indeed API url Initializes IndeedScraper with the Indeed API url
""" """
super().__init__(Site.INDEED, proxies=proxies) super().__init__(Site.INDEED, proxies=proxies)
self.session = create_session(proxies=self.proxies, is_tls=False) self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False)
self.scraper_input = None self.scraper_input = None
self.jobs_per_page = 100 self.jobs_per_page = 100
self.num_workers = 10 self.num_workers = 10

View File

@ -44,13 +44,14 @@ class LinkedInScraper(Scraper):
band_delay = 4 band_delay = 4
jobs_per_page = 25 jobs_per_page = 25
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
super().__init__(Site.LINKEDIN, proxies=proxies) super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
self.session = create_session( self.session = create_session(
proxies=self.proxies, proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False, is_tls=False,
has_retry=True, has_retry=True,
delay=5, delay=5,

View File

@ -100,6 +100,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
def create_session( def create_session(
*, *,
proxies: dict | str | None = None, proxies: dict | str | None = None,
ca_cert: str | None = None,
is_tls: bool = True, is_tls: bool = True,
has_retry: bool = False, has_retry: bool = False,
delay: int = 1, delay: int = 1,
@ -119,6 +120,9 @@ def create_session(
clear_cookies=clear_cookies, clear_cookies=clear_cookies,
) )
if ca_cert:
session.verify = ca_cert
return session return session

View File

@ -41,14 +41,14 @@ class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com" base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com" api_url = "https://api.ziprecruiter.com"
def __init__(self, proxies: list[str] | str | None = None): def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url
""" """
super().__init__(Site.ZIP_RECRUITER, proxies=proxies) super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
self.scraper_input = None self.scraper_input = None
self.session = create_session(proxies=proxies) self.session = create_session(proxies=proxies, ca_cert=ca_cert)
self._get_cookies() self._get_cookies()
self.delay = 5 self.delay = 5