format: jobspy/scrapers/ziprecruiter

pull/127/head
VitaminB16 2024-03-09 19:26:39 +00:00
parent 2e421ff4e2
commit 289f626e79
1 changed file with 35 additions and 34 deletions

View File

@ -4,6 +4,7 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter. This module contains routines to scrape ZipRecruiter.
""" """
from __future__ import annotations from __future__ import annotations
import math import math
@ -18,7 +19,7 @@ from ..utils import (
logger, logger,
extract_emails_from_text, extract_emails_from_text,
create_session, create_session,
markdown_converter markdown_converter,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -27,7 +28,7 @@ from ...jobs import (
JobResponse, JobResponse,
JobType, JobType,
Country, Country,
DescriptionFormat DescriptionFormat,
) )
@ -64,7 +65,7 @@ class ZipRecruiterScraper(Scraper):
break break
if page > 1: if page > 1:
time.sleep(self.delay) time.sleep(self.delay)
logger.info(f'ZipRecruiter search page: {page}') logger.info(f"ZipRecruiter search page: {page}")
jobs_on_page, continue_token = self._find_jobs_in_page( jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token scraper_input, continue_token
) )
@ -90,25 +91,24 @@ class ZipRecruiterScraper(Scraper):
if continue_token: if continue_token:
params["continue_from"] = continue_token params["continue_from"] = continue_token
try: try:
res= self.session.get( res = self.session.get(
f"{self.api_url}/jobs-app/jobs", f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
headers=self.headers,
params=params
) )
if res.status_code not in range(200, 400): if res.status_code not in range(200, 400):
if res.status_code == 429: if res.status_code == 429:
logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests') err = "429 Response - Blocked by ZipRecruiter for too many requests"
else: else:
logger.error(f'ZipRecruiter response status code {res.status_code}') err = f"ZipRecruiter response status code {res.status_code}"
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
logger.error(err)
return jobs_list, "" return jobs_list, ""
except Exception as e: except Exception as e:
if "Proxy responded with" in str(e): if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy') logger.error(f"Indeed: Bad proxy")
else: else:
logger.error(f'Indeed: {str(e)}') logger.error(f"Indeed: {str(e)}")
return jobs_list, "" return jobs_list, ""
res_data = res.json() res_data = res.json()
jobs_list = res_data.get("jobs", []) jobs_list = res_data.get("jobs", [])
next_continue_token = res_data.get("continue", None) next_continue_token = res_data.get("continue", None)
@ -129,7 +129,11 @@ class ZipRecruiterScraper(Scraper):
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
description = job.get("job_description", "").strip() description = job.get("job_description", "").strip()
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description description = (
markdown_converter(description)
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
else description
)
company = job.get("hiring_company", {}).get("name") company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada" country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value) country_enum = Country.from_string(country_value)
@ -140,23 +144,22 @@ class ZipRecruiterScraper(Scraper):
job_type = self._get_job_type_enum( job_type = self._get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower() job.get("employment_type", "").replace("_", "").lower()
) )
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date() date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
comp_interval = job.get("compensation_interval")
comp_interval = "yearly" if comp_interval == "annual" else comp_interval
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency")
return JobPost( return JobPost(
title=title, title=title,
company_name=company, company_name=company,
location=location, location=location,
job_type=job_type, job_type=job_type,
compensation=Compensation( compensation=Compensation(
interval="yearly" interval=comp_interval,
if job.get("compensation_interval") == "annual" min_amount=comp_min,
else job.get("compensation_interval"), max_amount=comp_max,
min_amount=int(job["compensation_min"]) currency=comp_currency,
if "compensation_min" in job
else None,
max_amount=int(job["compensation_max"])
if "compensation_max" in job
else None,
currency=job.get("compensation_currency"),
), ),
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
@ -165,8 +168,9 @@ class ZipRecruiterScraper(Scraper):
) )
def _get_cookies(self): def _get_cookies(self):
data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers) url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data, headers=self.headers)
@staticmethod @staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None: def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
@ -182,16 +186,13 @@ class ZipRecruiterScraper(Scraper):
"location": scraper_input.location, "location": scraper_input.location,
} }
if scraper_input.hours_old: if scraper_input.hours_old:
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None params["days"] = max(scraper_input.hours_old // 24, 1)
params['days'] = fromage job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
job_type_map = {
JobType.FULL_TIME: 'full_time',
JobType.PART_TIME: 'part_time'
}
if scraper_input.job_type: if scraper_input.job_type:
params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0] job_type = scraper_input.job_type
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply: if scraper_input.easy_apply:
params['zipapply'] = 1 params["zipapply"] = 1
if scraper_input.is_remote: if scraper_input.is_remote:
params["remote"] = 1 params["remote"] = 1
if scraper_input.distance: if scraper_input.distance: