mirror of https://github.com/Bunsly/JobSpy
reverted ziprecruiter
parent 25c084ca2c
commit cd916c7978
@@ -9,11 +9,10 @@ from datetime import datetime
 from bs4 import BeautifulSoup
 
-import cloudscraper  # NEW: Use cloudscraper to bypass Cloudflare
 
 from jobspy.ziprecruiter.constant import headers, get_cookie_data
 from jobspy.util import (
     extract_emails_from_text,
     create_session,
     markdown_converter,
     remove_attributes,
     create_logger,
@@ -42,20 +41,15 @@ class ZipRecruiter(Scraper):
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):
         """
-        Initializes ZipRecruiterScraper with the ZipRecruiter job search url.
-        This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
+        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
-
-        # Use cloudscraper instead of the standard session to handle Cloudflare.
-        self.session = cloudscraper.create_scraper()
-        if proxies:
-            self.session.proxies = proxies
-
-        self.scraper_input = None
+        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
+        self.session.headers.update(headers)
+        self._get_cookies()
+
+        self.scraper_input = None
         self.delay = 5
         self.jobs_per_page = 20
         self.seen_urls = set()
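
For context on what this revert removes: cloudscraper's create_scraper() returns a drop-in replacement for requests.Session, which is why the reverted branch could swap sessions without touching any request call sites. A minimal illustrative sketch, not code from this repo (the proxy URL is a placeholder):

    import cloudscraper

    # create_scraper() returns a CloudScraper instance, a subclass of
    # requests.Session, so .get()/.post()/.headers/.proxies behave as usual
    scraper = cloudscraper.create_scraper()

    # requests expects a scheme -> URL dict here, which is one reason the
    # reverted `self.session.proxies = proxies` (a list or str) was fragile
    scraper.proxies = {
        "http": "http://proxy.example:8080",   # placeholder proxy
        "https": "http://proxy.example:8080",
    }

    resp = scraper.get("https://www.ziprecruiter.com")
    print(resp.status_code)  # Cloudflare challenge handled transparently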
@@ -92,10 +86,10 @@ class ZipRecruiter(Scraper):
         self, scraper_input: ScraperInput, continue_token: str | None = None
     ) -> tuple[list[JobPost], str | None]:
         """
-        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria.
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
         :param continue_token:
-        :return: jobs found on page.
+        :return: jobs found on page
         """
         jobs_list = []
         params = add_params(scraper_input)
@@ -129,7 +123,7 @@ class ZipRecruiter(Scraper):
 
     def _process_job(self, job: dict) -> JobPost | None:
         """
-        Processes an individual job dict from the response.
+        Processes an individual job dict from the response
         """
         title = job.get("name")
         job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@@ -190,16 +184,16 @@ class ZipRecruiter(Scraper):
         job_descr_div = soup.find("div", class_="job_description")
         company_descr_section = soup.find("section", class_="company_description")
         job_description_clean = (
-            remove_attributes(job_descr_div).get_text(separator="\n", strip=True)
+            remove_attributes(job_descr_div).prettify(formatter="html")
             if job_descr_div
             else ""
         )
         company_description_clean = (
-            remove_attributes(company_descr_section).get_text(separator="\n", strip=True)
+            remove_attributes(company_descr_section).prettify(formatter="html")
             if company_descr_section
             else ""
         )
-        description_full = job_description_clean + "\n" + company_description_clean
+        description_full = job_description_clean + company_description_clean
 
         try:
             script_tag = soup.find("script", type="application/json")
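
The description change restored above is easiest to see on a small snippet: get_text() flattens markup to plain text, while prettify(formatter="html") keeps the tags so the structure survives for downstream conversion. A quick standalone illustration using only bs4 (the sample HTML is made up):

    from bs4 import BeautifulSoup

    html = '<div class="job_description"><p>Build <b>APIs</b></p></div>'
    div = BeautifulSoup(html, "html.parser").find("div", class_="job_description")

    # The reverted branch: plain text, one string per line, tags gone
    print(div.get_text(separator="\n", strip=True))
    # Build
    # APIs

    # The restored code: indented HTML with tags intact
    print(div.prettify(formatter="html"))
    # <div class="job_description">
    #  <p>
    #   Build
    #   <b>
    #    APIs
    #   </b>
    #  </p>
    # </div>

This also explains dropping the "\n" from the restored concatenation: prettify() output already ends with a newline, so no extra separator is needed between the job and company sections.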
@@ -222,4 +216,4 @@ class ZipRecruiter(Scraper):
         Sends a session event to the API with device properties.
         """
         url = f"{self.api_url}/jobs-app/event"
-        self.session.post(url, data=get_cookie_data)
+        self.session.post(url, data=get_cookie_data)
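
The event POST above is what primes the session with cookies: the server's Set-Cookie headers land in the session's cookie jar and ride along on every subsequent request. A rough sketch of the pattern, with an assumed API host and a placeholder payload (the real device properties come from get_cookie_data in jobspy.ziprecruiter.constant):

    import requests

    session = requests.Session()
    api_url = "https://api.ziprecruiter.com"  # assumed value of self.api_url

    # POST a session event; cookies set by the response are stored on the
    # session object and sent automatically with every later request
    session.post(f"{api_url}/jobs-app/event", data={"event_type": "session"})

    print(session.cookies.get_dict())  # cookie jar now primed for reuse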
@@ -28,4 +28,4 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
     for job_type in JobType:
         if job_type_str in job_type.value:
             return [job_type]
-    return None
+    return None
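
For readers outside the repo: the membership test `if job_type_str in job_type.value` works because each JobType member's value is a tuple of accepted spellings. A self-contained sketch with a stand-in enum (the real members live in jobspy's model module and may differ):

    from enum import Enum

    class JobType(Enum):  # stand-in; not the repo's actual definition
        FULL_TIME = ("fulltime", "full-time", "full time")
        PART_TIME = ("parttime", "part-time", "part time")

    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
        # tuple membership: any listed alias maps to the same member
        for job_type in JobType:
            if job_type_str in job_type.value:
                return [job_type]
        return None

    print(get_job_type_enum("full-time"))  # [<JobType.FULL_TIME: (...)>]
    print(get_job_type_enum("contract"))   # None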