mirror of https://github.com/Bunsly/JobSpy

FEATURE: Add the "ca_cert" setting for providing a Certification Authority certificate in order to use proxies requiring it. (#204)

parent 0cc34287f7
commit 6bc191d5c7
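A minimal sketch of how the new setting might be called from user code, assuming the package is imported as in the JobSpy README; the proxy address and CA bundle path below are placeholders, not values from this commit:

import pandas as pd
from jobspy import scrape_jobs

jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin"],
    search_term="software engineer",
    proxies="user:pass@10.0.0.1:8080",       # placeholder proxy requiring TLS interception
    ca_cert="/path/to/proxy-ca-bundle.pem",  # placeholder path to the proxy's CA certificate
)
print(jobs.head())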
@@ -31,6 +31,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxies: list[str] | str | None = None,
+    ca_cert: str | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
@@ -97,7 +98,7 @@ def scrape_jobs(

     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxies=proxies)
+        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
@@ -42,9 +42,10 @@ class ScraperInput(BaseModel):


 class Scraper(ABC):
-    def __init__(self, site: Site, proxies: list[str] | None = None):
-        self.proxies = proxies
+    def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
         self.site = site
+        self.proxies = proxies
+        self.ca_cert = ca_cert

     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
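The base class now stores ca_cert alongside proxies, so each site scraper can forward both to its HTTP session. A hypothetical subclass following the pattern the hunks below repeat (the class name is illustrative, not part of the commit):

class ExampleScraper(Scraper):
    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
        super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
        # Both settings reach create_session, so requests made through the
        # proxy are verified against the caller-supplied CA bundle.
        self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False)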
@@ -34,12 +34,12 @@ from ...jobs import (


 class GlassdoorScraper(Scraper):
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxies=proxies)
+        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

         self.base_url = None
         self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()

-        self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
+        self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token

@@ -32,13 +32,13 @@ from ...jobs import (


 class IndeedScraper(Scraper):
-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """
         super().__init__(Site.INDEED, proxies=proxies)

-        self.session = create_session(proxies=self.proxies, is_tls=False)
+        self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False)
         self.scraper_input = None
         self.jobs_per_page = 100
         self.num_workers = 10
@@ -44,13 +44,14 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25

-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site.LINKEDIN, proxies=proxies)
+        super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
         self.session = create_session(
             proxies=self.proxies,
+            ca_cert=ca_cert,
             is_tls=False,
             has_retry=True,
             delay=5,
@@ -100,6 +100,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
 def create_session(
     *,
     proxies: dict | str | None = None,
+    ca_cert: str | None = None,
     is_tls: bool = True,
     has_retry: bool = False,
     delay: int = 1,
@@ -119,6 +120,9 @@ def create_session(
             clear_cookies=clear_cookies,
         )

+    if ca_cert:
+        session.verify = ca_cert
+
     return session

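The change relies on the standard requests behavior that a string assigned to session.verify is treated as the path of a CA bundle used during TLS verification; a self-contained illustration with a placeholder path:

import requests

session = requests.Session()
session.verify = "/path/to/proxy-ca.pem"  # placeholder; any PEM bundle path works
# Every request on this session now validates server certificates against
# that bundle instead of the default certifi store.
response = session.get("https://example.com")
print(response.status_code)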
@@ -41,14 +41,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"

-    def __init__(self, proxies: list[str] | str | None = None):
+    def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         super().__init__(Site.ZIP_RECRUITER, proxies=proxies)

         self.scraper_input = None
-        self.session = create_session(proxies=proxies)
+        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
         self._get_cookies()

         self.delay = 5