Compare commits

...

7 Commits

Author SHA1 Message Date
Cullen Watson
e3fc222eb5 readd proxy support for zip (#64) 2023-10-29 08:54:56 -05:00
Cullen
b303b3f841 chore: version 2023-10-28 16:58:32 -05:00
Cullen
1a0c75f323 chore: version 2023-10-28 16:54:04 -05:00
Cullen
e2f6885d61 chore: format 2023-10-28 16:52:05 -05:00
Cullen
8d65d1b652 [chore] version 2023-10-28 16:43:44 -05:00
Cullen
216d3fd39f ziprecruiter: 5s delay 2023-10-28 16:41:32 -05:00
Cullen Watson
d3bfdc0a6e ziprecruiter api (#63) 2023-10-28 16:17:28 -05:00
5 changed files with 42 additions and 140 deletions

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.15" version = "1.1.22"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -177,8 +177,8 @@ class CompensationInterval(Enum):
class Compensation(BaseModel): class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None interval: Optional[CompensationInterval] = None
min_amount: int = None min_amount: int | None = None
max_amount: int = None max_amount: int | None = None
currency: Optional[str] = "USD" currency: Optional[str] = "USD"

View File

@@ -58,7 +58,6 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country self.country = scraper_input.country
domain = self.country.domain_value domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com" self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
@@ -78,6 +77,7 @@ class IndeedScraper(Scraper):
if sc_values: if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";" params["sc"] = "0kf:" + "".join(sc_values) + ";"
try: try:
session = create_session(self.proxy, is_tls=True)
response = session.get( response = session.get(
f"{self.url}/jobs", f"{self.url}/jobs",
headers=self.get_headers(), headers=self.get_headers(),

View File

@@ -1,4 +1,6 @@
import re import re
import requests
import tls_client import tls_client
from ..jobs import JobType from ..jobs import JobType
@@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text) return email_regex.findall(text)
def create_session(proxy: str | None = None): def create_session(proxy: dict | None = None, is_tls: bool = True):
""" """
Creates a tls client session Creates a tls client session
:return: A session object with or without proxies. :return: A session object with or without proxies.
""" """
session = tls_client.Session( if is_tls:
client_identifier="chrome112", session = tls_client.Session(
random_tls_extension_order=True, client_identifier="chrome112",
) random_tls_extension_order=True,
session.proxies = proxy )
# TODO multiple proxies session.proxies = proxy
# if self.proxies: # TODO multiple proxies
# session.proxies = { # if self.proxies:
# "http": random.choice(self.proxies), # session.proxies = {
# "https": random.choice(self.proxies), # "http": random.choice(self.proxies),
# } # "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
session.proxies.update(proxy)
return session return session

View File

@@ -5,29 +5,18 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter. This module contains routines to scrape ZipRecruiter.
""" """
import math import math
import json import time
import re import re
from datetime import datetime, date from datetime import datetime, date
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@@ -42,21 +31,22 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]: def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
""" """
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:param continue_token:
:return: jobs found on page :return: jobs found on page
""" """
params = self.add_params(scraper_input) params = self.add_params(scraper_input)
if continue_token: if continue_token:
params['continue'] = continue_token params['continue'] = continue_token
try: try:
response = requests.get( session = create_session(self.proxy, is_tls=False)
response = session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs", f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(), headers=self.headers(),
params=self.add_params(scraper_input), params=self.add_params(scraper_input),
allow_redirects=True,
timeout=10, timeout=10,
) )
if response.status_code != 200: if response.status_code != 200:
@@ -68,11 +58,12 @@ class ZipRecruiterScraper(Scraper):
raise ZipRecruiterException("bad proxy") raise ZipRecruiterException("bad proxy")
raise ZipRecruiterException(str(e)) raise ZipRecruiterException(str(e))
time.sleep(5)
response_data = response.json() response_data = response.json()
jobs_list = response_data.get("jobs", []) jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get('continue', None) next_continue_token = response_data.get('continue', None)
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
job_results = [ job_results = [
executor.submit(self.process_job, job) executor.submit(self.process_job, job)
for job in jobs_list for job in jobs_list
@@ -108,17 +99,17 @@ class ZipRecruiterScraper(Scraper):
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job(self, job: dict) -> JobPost: @staticmethod
"""the most common type of jobs page on ZR""" def process_job(job: dict) -> JobPost:
""" Processes an individual job dict from the response """
title = job.get("name") title = job.get("name")
job_url = job.get("job_url") job_url = job.get("job_url")
# job_url = updated_job_url if updated_job_url else job_url
description = BeautifulSoup( description = BeautifulSoup(
job.get("job_description", "").strip(), "html.parser" job.get("job_description", "").strip(), "html.parser"
).get_text() ).get_text()
company = job.get("source") company = job['hiring_company'].get("name") if "hiring_company" in job else None
location = Location( location = Location(
city=job.get("job_city"), state=job.get("job_state"), country='usa' if job.get("job_country") == 'US' else 'canada' city=job.get("job_city"), state=job.get("job_state"), country='usa' if job.get("job_country") == 'US' else 'canada'
) )
@@ -142,7 +133,12 @@ class ZipRecruiterScraper(Scraper):
company_name=company, company_name=company,
location=location, location=location,
job_type=job_type, job_type=job_type,
# compensation=compensation, compensation=Compensation(
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
currency=job.get("compensation_currency"),
),
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
description=description, description=description,
@@ -186,107 +182,6 @@ class ZipRecruiterScraper(Scraper):
return params return params
@staticmethod
def get_interval(interval_str: str):
"""
Maps the interval alias to its appropriate CompensationInterval.
:param interval_str
:return: CompensationInterval
"""
interval_alias = {"annually": CompensationInterval.YEARLY}
interval_str = interval_str.lower()
if interval_str in interval_alias:
return interval_alias[interval_str]
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: Tag) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
:return: date the job was posted or None
"""
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
@staticmethod
def get_compensation(job: Tag) -> Optional[Compensation]:
"""
Parses the compensation tag from the job BeautifulSoup object
:param job
:return: Compensation object or None
"""
pay_element = job.find("li", {"class": "perk_item perk_pay"})
if pay_element is None:
return None
pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
def create_compensation_object(pay_string: str) -> Compensation:
"""
Creates a Compensation object from a pay_string
:param pay_string
:return: compensation
"""
interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
amounts = []
for amount in pay_string.split("to"):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = int(float(amount)) * 1000
else:
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(
interval=interval,
min_amount=min(amounts),
max_amount=max(amounts),
currency="USD/CAD",
)
return compensation
return create_compensation_object(pay)
@staticmethod
def get_location(job: Tag) -> Location:
"""
Extracts the job location from BeatifulSoup object
:param job:
:return: location
"""
location_link = job.find("a", {"class": "company_location"})
if location_link is not None:
location_string = location_link.text.strip()
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
else:
city, state = None, None
else:
city, state = None, None
return Location(city=city, state=state, country=Country.US_CANADA)
@staticmethod @staticmethod
def headers() -> dict: def headers() -> dict:
""" """