pull/274/merge
Lê Trọng Tài 2025-05-04 01:09:55 +07:00 committed by GitHub
commit a6ad371a3d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 14 additions and 10 deletions

View File

@ -45,6 +45,7 @@ def scrape_jobs(
hours_old: int = None, hours_old: int = None,
enforce_annual_salary: bool = False, enforce_annual_salary: bool = False,
verbose: int = 0, verbose: int = 0,
user_agent: str = None,
**kwargs, **kwargs,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
@ -98,7 +99,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert) scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name

View File

@ -25,7 +25,7 @@ class BaytScraper(Scraper):
band_delay = 3 band_delay = 3
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert) super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
self.scraper_input = None self.scraper_input = None

View File

@ -34,13 +34,13 @@ log = create_logger("Glassdoor")
class Glassdoor(Scraper): class Glassdoor(Scraper):
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes GlassdoorScraper with the Glassdoor job search url Initializes GlassdoorScraper with the Glassdoor job search url
""" """
site = Site(Site.GLASSDOOR) site = Site(Site.GLASSDOOR)
super().__init__(site, proxies=proxies, ca_cert=ca_cert) super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
self.base_url = None self.base_url = None
self.country = None self.country = None
@ -65,6 +65,8 @@ class Glassdoor(Scraper):
) )
token = self._get_csrf_token() token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token headers["gd-csrf-token"] = token if token else fallback_token
if self.user_agent:
headers["user-agent"] = self.user_agent
self.session.headers.update(headers) self.session.headers.update(headers)
location_id, location_type = self._get_location( location_id, location_type = self._get_location(

View File

@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info
class Google(Scraper): class Google(Scraper):
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes Google Scraper with the Google jobs search url Initializes Google Scraper with the Google jobs search url

View File

@ -28,7 +28,7 @@ log = create_logger("Indeed")
class Indeed(Scraper): class Indeed(Scraper):
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes IndeedScraper with the Indeed API url Initializes IndeedScraper with the Indeed API url

View File

@ -50,7 +50,7 @@ class LinkedIn(Scraper):
jobs_per_page = 25 jobs_per_page = 25
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url

View File

@ -320,11 +320,12 @@ class ScraperInput(BaseModel):
class Scraper(ABC):
    """Abstract base class for all job-board scrapers.

    Holds the connection options shared by every concrete scraper
    (proxies, CA certificate, user-agent override); subclasses
    implement :meth:`scrape` for their specific site.
    """

    def __init__(
        self,
        site: Site,
        # widened from list[str] | None to match the concrete subclasses,
        # which all accept a single proxy string as well as a list
        proxies: list[str] | str | None = None,
        ca_cert: str | None = None,
        user_agent: str | None = None,
    ):
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert
        # Optional User-Agent header override; None means the scraper's default.
        self.user_agent = user_agent

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@ -44,7 +44,7 @@ class Naukri(Scraper):
jobs_per_page = 20 jobs_per_page = 20
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes NaukriScraper with the Naukri API URL Initializes NaukriScraper with the Naukri API URL

View File

@ -38,7 +38,7 @@ class ZipRecruiter(Scraper):
api_url = "https://api.ziprecruiter.com" api_url = "https://api.ziprecruiter.com"
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url