reverted ziprecruiter

pull/268/head
fakebranden 2025-03-12 00:16:09 +00:00
parent 25c084ca2c
commit cd916c7978
2 changed files with 12 additions and 18 deletions

View File

@@ -9,11 +9,10 @@ from datetime import datetime
from bs4 import BeautifulSoup
import cloudscraper # NEW: Use cloudscraper to bypass Cloudflare
from jobspy.ziprecruiter.constant import headers, get_cookie_data
from jobspy.util import (
extract_emails_from_text,
create_session,
markdown_converter,
remove_attributes,
create_logger,
@@ -42,20 +41,15 @@ class ZipRecruiter(Scraper):
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url.
This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
"""
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
# Use cloudscraper instead of the standard session to handle Cloudflare.
self.session = cloudscraper.create_scraper()
if proxies:
self.session.proxies = proxies
self.scraper_input = None
self.session = create_session(proxies=proxies, ca_cert=ca_cert)
self.session.headers.update(headers)
self._get_cookies()
self.scraper_input = None
self.delay = 5
self.jobs_per_page = 20
self.seen_urls = set()
@@ -92,10 +86,10 @@ class ZipRecruiter(Scraper):
self, scraper_input: ScraperInput, continue_token: str | None = None
) -> tuple[list[JobPost], str | None]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria.
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:param continue_token:
:return: jobs found on page.
:return: jobs found on page
"""
jobs_list = []
params = add_params(scraper_input)
@@ -129,7 +123,7 @@ class ZipRecruiter(Scraper):
def _process_job(self, job: dict) -> JobPost | None:
"""
Processes an individual job dict from the response.
Processes an individual job dict from the response
"""
title = job.get("name")
job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@@ -190,16 +184,16 @@ class ZipRecruiter(Scraper):
job_descr_div = soup.find("div", class_="job_description")
company_descr_section = soup.find("section", class_="company_description")
job_description_clean = (
remove_attributes(job_descr_div).get_text(separator="\n", strip=True)
remove_attributes(job_descr_div).prettify(formatter="html")
if job_descr_div
else ""
)
company_description_clean = (
remove_attributes(company_descr_section).get_text(separator="\n", strip=True)
remove_attributes(company_descr_section).prettify(formatter="html")
if company_descr_section
else ""
)
description_full = job_description_clean + "\n" + company_description_clean
description_full = job_description_clean + company_description_clean
try:
script_tag = soup.find("script", type="application/json")
@@ -222,4 +216,4 @@ class ZipRecruiter(Scraper):
Sends a session event to the API with device properties.
"""
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=get_cookie_data)
self.session.post(url, data=get_cookie_data)

View File

@@ -28,4 +28,4 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
return None