Compare commits

..

2 Commits

Author  SHA1        Message                     Date
Cullen  78c1ec8e9f  [fix] add compensation      2023-10-28 16:13:10 -05:00
Cullen  a2dd93aca1  [enh] use ziprecuriter api  2023-10-28 15:50:28 -05:00
4 changed files with 137 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.23"
version = "1.1.16"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -58,6 +58,7 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country
domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
params = {
"q": scraper_input.search_term,
@@ -77,7 +78,6 @@ class IndeedScraper(Scraper):
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
try:
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"{self.url}/jobs",
headers=self.get_headers(),
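This Indeed hunk moves session creation out of the try block so the session exists before the query parameters are assembled. A minimal standalone sketch of that ordering, using plain requests with invented headers and parameters (the repo's own create_session helper appears in the utils diff further down):

import requests


def fetch_indeed_jobs(search_term: str, domain: str = "www", proxy: str | None = None) -> str:
    """Create the session first, then build params and issue the request."""
    session = requests.Session()
    if proxy:
        # Illustrative proxy handling; the real scraper delegates this to create_session().
        session.proxies.update({"http": proxy, "https": proxy})

    params = {"q": search_term, "filter": 0}
    response = session.get(
        f"https://{domain}.indeed.com/jobs",
        headers={"User-Agent": "Mozilla/5.0"},
        params=params,
        timeout=10,
    )
    response.raise_for_status()
    return response.text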

View File

@@ -1,6 +1,4 @@
import re
import requests
import tls_client
from ..jobs import JobType
@@ -26,29 +24,23 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True):
def create_session(proxy: str | None = None):
"""
Creates a tls client session
:return: A session object with or without proxies.
"""
if is_tls:
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
if proxy:
session.proxies.update(proxy)
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
return session
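The two versions of create_session above differ in whether a plain requests fallback survives: one keeps an is_tls flag with a requests.Session branch, the other always builds a tls_client session. A standalone sketch of the branching variant, assuming the tls-client package is installed and that the proxy argument is a single URL string (the diff passes a dict straight through, so the proxy handling here is simplified):

import requests
import tls_client


def create_session_sketch(proxy: str | None = None, is_tls: bool = True):
    """Return a browser-fingerprinted tls_client session by default, or a plain requests session."""
    if is_tls:
        # tls_client imitates a real Chrome TLS fingerprint, which some job
        # boards expect before they will serve results.
        session = tls_client.Session(
            client_identifier="chrome112",
            random_tls_extension_order=True,
        )
        if proxy:
            session.proxies = {"http": proxy, "https": proxy}
    else:
        # A plain requests session is enough for JSON APIs that do not
        # inspect the TLS fingerprint.
        session = requests.Session()
        if proxy:
            session.proxies.update({"http": proxy, "https": proxy})
    return session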

View File

@@ -5,18 +5,29 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter.
"""
import math
import time
import json
import re
from datetime import datetime, date
from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
from ...jobs import (
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)
class ZipRecruiterScraper(Scraper):
@@ -31,22 +42,21 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20
self.seen_urls = set()
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:param continue_token:
:return: jobs found on page
"""
params = self.add_params(scraper_input)
if continue_token:
params['continue'] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
response = session.get(
response = requests.get(
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input),
allow_redirects=True,
timeout=10,
)
if response.status_code != 200:
@@ -58,12 +68,11 @@ class ZipRecruiterScraper(Scraper):
raise ZipRecruiterException("bad proxy")
raise ZipRecruiterException(str(e))
time.sleep(5)
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get('continue', None)
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
with ThreadPoolExecutor(max_workers=10) as executor:
job_results = [
executor.submit(self.process_job, job)
for job in jobs_list
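Taken together, the hunks above fetch one page from the jobs-app API and fan the returned job dicts out to a thread pool. A rough end-to-end sketch of that flow; the endpoint and the 'continue' pagination token come from the diff, while the headers, search parameters, and the trivial stand-in for process_job are invented for illustration:

from concurrent.futures import ThreadPoolExecutor

import requests


def fetch_zip_jobs_page(search_term: str, continue_token: str | None = None):
    """Fetch one page of jobs and process each job dict concurrently."""
    params = {"search": search_term}
    if continue_token:
        # Cursor-style pagination: pass the token from the previous response.
        params["continue"] = continue_token

    response = requests.get(
        "https://api.ziprecruiter.com/jobs-app/jobs",
        headers={"User-Agent": "Mozilla/5.0"},
        params=params,
        allow_redirects=True,
        timeout=10,
    )
    response.raise_for_status()
    data = response.json()

    jobs_list = data.get("jobs", [])
    next_token = data.get("continue")

    def process_job_stub(job: dict) -> str | None:
        # Stand-in for ZipRecruiterScraper.process_job: just pull the title.
        return job.get("name")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_job_stub, job) for job in jobs_list]
        titles = [future.result() for future in futures]

    return titles, next_token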
@@ -99,9 +108,8 @@ class ZipRecruiterScraper(Scraper):
return JobResponse(jobs=job_list)
@staticmethod
def process_job(job: dict) -> JobPost:
""" Processes an individual job dict from the response """
def process_job(self, job: dict) -> JobPost:
"""the most common type of jobs page on ZR"""
title = job.get("name")
job_url = job.get("job_url")
@@ -128,13 +136,14 @@ class ZipRecruiterScraper(Scraper):
else:
date_posted = date.today()
return JobPost(
title=title,
company_name=company,
location=location,
job_type=job_type,
compensation=Compensation(
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") ,
min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
currency=job.get("compensation_currency"),
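The Compensation(...) call above normalizes the API's "annual" interval to "yearly" and coerces the min/max fields to integers only when they are present. The same mapping on an invented payload, returning plain values instead of the Compensation model:

def normalize_compensation(job: dict) -> dict:
    """Mirror the field mapping above without depending on the jobspy models."""
    interval = job.get("compensation_interval")
    if interval == "annual":
        interval = "yearly"  # the API says 'annual'; the model expects 'yearly'
    return {
        "interval": interval,
        "min_amount": int(job["compensation_min"]) if "compensation_min" in job else None,
        "max_amount": int(job["compensation_max"]) if "compensation_max" in job else None,
        "currency": job.get("compensation_currency"),
    }


# Hypothetical payload for illustration:
example = {
    "compensation_interval": "annual",
    "compensation_min": 90000,
    "compensation_max": 120000,
    "compensation_currency": "USD",
}
print(normalize_compensation(example))
# -> {'interval': 'yearly', 'min_amount': 90000, 'max_amount': 120000, 'currency': 'USD'}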
@@ -182,6 +191,107 @@ class ZipRecruiterScraper(Scraper):
return params
@staticmethod
def get_interval(interval_str: str):
"""
Maps the interval alias to its appropriate CompensationInterval.
:param interval_str
:return: CompensationInterval
"""
interval_alias = {"annually": CompensationInterval.YEARLY}
interval_str = interval_str.lower()
if interval_str in interval_alias:
return interval_alias[interval_str]
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: Tag) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
:return: date the job was posted or None
"""
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
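get_date_posted pulls a posted_time query parameter out of the save-job button's data-href URL and parses it as a UTC timestamp. A standalone sketch of just the URL-parsing step, run on a made-up href value:

from datetime import date, datetime
from urllib.parse import parse_qs, urlparse


def posted_date_from_href(url_time: str) -> date | None:
    """Extract and parse posted_time, as get_date_posted does above."""
    params = parse_qs(urlparse(url_time).query)
    posted_time_str = params.get("posted_time", [None])[0]
    if posted_time_str:
        return datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()
    return None


# Hypothetical data-href value:
print(posted_date_from_href("/jobs/save?posted_time=2023-10-27T12:00:00Z"))
# -> 2023-10-27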
@staticmethod
def get_compensation(job: Tag) -> Optional[Compensation]:
"""
Parses the compensation tag from the job BeautifulSoup object
:param job
:return: Compensation object or None
"""
pay_element = job.find("li", {"class": "perk_item perk_pay"})
if pay_element is None:
return None
pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
def create_compensation_object(pay_string: str) -> Compensation:
"""
Creates a Compensation object from a pay_string
:param pay_string
:return: compensation
"""
interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
amounts = []
for amount in pay_string.split("to"):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = int(float(amount)) * 1000
else:
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(
interval=interval,
min_amount=min(amounts),
max_amount=max(amounts),
currency="USD/CAD",
)
return compensation
return create_compensation_object(pay)
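To make the string handling in create_compensation_object concrete, here is the same splitting and "K"-suffix logic applied to invented pay strings (the parsing steps follow the code above; the inputs and the free-standing function name are hypothetical):

def parse_pay_string(pay_string: str) -> tuple[int, int, str]:
    """Replicate the amount parsing above: returns (min, max, interval word)."""
    interval_word = pay_string.split()[-1]  # e.g. 'yearly', or 'annually' (mapped by get_interval)
    amounts = []
    for amount in pay_string.split("to"):
        amount = amount.replace(",", "").strip("$ ").split(" ")[0]
        if "K" in amount:
            amount = amount.replace("K", "")
            amount = int(float(amount)) * 1000  # '$60K' -> 60000
        else:
            amount = int(float(amount))
        amounts.append(amount)
    return min(amounts), max(amounts), interval_word


print(parse_pay_string("$60K to $80K yearly"))        # -> (60000, 80000, 'yearly')
print(parse_pay_string("$60,000 to $80,000 yearly"))  # -> (60000, 80000, 'yearly')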
@staticmethod
def get_location(job: Tag) -> Location:
"""
Extracts the job location from a BeautifulSoup object
:param job:
:return: location
"""
location_link = job.find("a", {"class": "company_location"})
if location_link is not None:
location_string = location_link.text.strip()
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
else:
city, state = None, None
else:
city, state = None, None
return Location(city=city, state=state, country=Country.US_CANADA)
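get_location only accepts a two-part "City, ST" string and otherwise falls back to an empty city/state. A quick standalone illustration of that split on invented location strings:

def split_location(location_string: str) -> tuple[str | None, str | None]:
    """Same 'City, ST' split as get_location, minus the BeautifulSoup lookup."""
    parts = location_string.split(", ")
    if len(parts) == 2:
        return parts[0], parts[1]
    return None, None


print(split_location("Austin, TX"))       # -> ('Austin', 'TX')
print(split_location("Remote"))           # -> (None, None)
print(split_location("Dallas, TX, USA"))  # -> (None, None); three parts are rejected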
@staticmethod
def headers() -> dict:
"""