reverted ziprecruiter

pull/268/head
fakebranden 2025-03-12 00:16:09 +00:00
parent 25c084ca2c
commit cd916c7978
2 changed files with 12 additions and 18 deletions

View File

@ -9,11 +9,10 @@ from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import cloudscraper # NEW: Use cloudscraper to bypass Cloudflare
from jobspy.ziprecruiter.constant import headers, get_cookie_data from jobspy.ziprecruiter.constant import headers, get_cookie_data
from jobspy.util import ( from jobspy.util import (
extract_emails_from_text, extract_emails_from_text,
create_session,
markdown_converter, markdown_converter,
remove_attributes, remove_attributes,
create_logger, create_logger,
@ -42,20 +41,15 @@ class ZipRecruiter(Scraper):
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None
): ):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url. Initializes ZipRecruiterScraper with the ZipRecruiter job search url
This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
""" """
super().__init__(Site.ZIP_RECRUITER, proxies=proxies) super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
# Use cloudscraper instead of the standard session to handle Cloudflare. self.scraper_input = None
self.session = cloudscraper.create_scraper() self.session = create_session(proxies=proxies, ca_cert=ca_cert)
if proxies:
self.session.proxies = proxies
self.session.headers.update(headers) self.session.headers.update(headers)
self._get_cookies() self._get_cookies()
self.scraper_input = None
self.delay = 5 self.delay = 5
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
@ -92,10 +86,10 @@ class ZipRecruiter(Scraper):
self, scraper_input: ScraperInput, continue_token: str | None = None self, scraper_input: ScraperInput, continue_token: str | None = None
) -> tuple[list[JobPost], str | None]: ) -> tuple[list[JobPost], str | None]:
""" """
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria. Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:param continue_token: :param continue_token:
:return: jobs found on page. :return: jobs found on page
""" """
jobs_list = [] jobs_list = []
params = add_params(scraper_input) params = add_params(scraper_input)
@ -129,7 +123,7 @@ class ZipRecruiter(Scraper):
def _process_job(self, job: dict) -> JobPost | None: def _process_job(self, job: dict) -> JobPost | None:
""" """
Processes an individual job dict from the response. Processes an individual job dict from the response
""" """
title = job.get("name") title = job.get("name")
job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}" job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@ -190,16 +184,16 @@ class ZipRecruiter(Scraper):
job_descr_div = soup.find("div", class_="job_description") job_descr_div = soup.find("div", class_="job_description")
company_descr_section = soup.find("section", class_="company_description") company_descr_section = soup.find("section", class_="company_description")
job_description_clean = ( job_description_clean = (
remove_attributes(job_descr_div).get_text(separator="\n", strip=True) remove_attributes(job_descr_div).prettify(formatter="html")
if job_descr_div if job_descr_div
else "" else ""
) )
company_description_clean = ( company_description_clean = (
remove_attributes(company_descr_section).get_text(separator="\n", strip=True) remove_attributes(company_descr_section).prettify(formatter="html")
if company_descr_section if company_descr_section
else "" else ""
) )
description_full = job_description_clean + "\n" + company_description_clean description_full = job_description_clean + company_description_clean
try: try:
script_tag = soup.find("script", type="application/json") script_tag = soup.find("script", type="application/json")
@ -222,4 +216,4 @@ class ZipRecruiter(Scraper):
Sends a session event to the API with device properties. Sends a session event to the API with device properties.
""" """
url = f"{self.api_url}/jobs-app/event" url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=get_cookie_data) self.session.post(url, data=get_cookie_data)

View File

@ -28,4 +28,4 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
if job_type_str in job_type.value: if job_type_str in job_type.value:
return [job_type] return [job_type]
return None return None