mirror of https://github.com/Bunsly/JobSpy
parent b303b3f841
commit e3fc222eb5
In pyproject.toml, the package version is bumped:

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.21"
+version = "1.1.22"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
In the Indeed scraper (IndeedScraper), session creation is removed from the top of the search flow:

@@ -58,7 +58,6 @@ class IndeedScraper(Scraper):
         self.country = scraper_input.country
         domain = self.country.domain_value
         self.url = f"https://{domain}.indeed.com"
-        session = create_session(self.proxy)
 
         params = {
             "q": scraper_input.search_term,
...and recreated inside the try block, now explicitly requesting a TLS-client session:

@@ -78,6 +77,7 @@ class IndeedScraper(Scraper):
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
+            session = create_session(self.proxy, is_tls=True)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),
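A hedged usage sketch of the helper call the Indeed scraper now makes per request (the proxy dict shape follows the requests convention and is an assumption; tls_client sessions expose a requests-like get):

    # hypothetical values; create_session comes from the utils change below
    proxy = {"http": "http://user:pass@host:port", "https": "http://user:pass@host:port"}
    session = create_session(proxy, is_tls=True)  # tls_client.Session impersonating chrome112
    response = session.get("https://www.indeed.com/jobs", params={"q": "software engineer"})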
In the shared scraper utils, requests joins the imports:

@@ -1,4 +1,6 @@
 import re
 
+import requests
+
 import tls_client
 from ..jobs import JobType
create_session now takes the proxy as a dict and an is_tls flag, branching between a tls_client session and a plain requests session:

@@ -24,12 +26,13 @@ def extract_emails_from_text(text: str) -> list[str] | None:
     return email_regex.findall(text)
 
 
-def create_session(proxy: str | None = None):
+def create_session(proxy: dict | None = None, is_tls: bool = True):
     """
     Creates a tls client session
 
     :return: A session object with or without proxies.
     """
-    session = tls_client.Session(
-        client_identifier="chrome112",
-        random_tls_extension_order=True,
+    if is_tls:
+        session = tls_client.Session(
+            client_identifier="chrome112",
+            random_tls_extension_order=True,

@@ -41,6 +44,10 @@ def create_session(proxy: str | None = None):
         #     "http": random.choice(self.proxies),
         #     "https": random.choice(self.proxies),
         # }
+    else:
+        session = requests.Session()
+        session.allow_redirects = True
+        session.proxies.update(proxy)
 
     return session
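For reference, a minimal sketch of the whole helper after these hunks, assuming the unshown middle of the is_tls branch only attaches the proxy and carries the commented-out rotation notes (the exact attachment is an assumption):

    import requests
    import tls_client

    def create_session(proxy: dict | None = None, is_tls: bool = True):
        """Creates a tls_client session, or a plain requests session when is_tls=False."""
        if is_tls:
            session = tls_client.Session(
                client_identifier="chrome112",       # impersonate Chrome 112's TLS fingerprint
                random_tls_extension_order=True,
            )
            session.proxies = proxy                  # assumption: proxy dict assigned directly
        else:
            session = requests.Session()
            session.allow_redirects = True           # requests reads allow_redirects per request, not from the Session
            session.proxies.update(proxy)            # raises TypeError when proxy is None
        return session

Two caveats in the requests branch, flagged in the comments above: dict.update(None) raises, so as written callers must always pass a proxy dict, and GET requests already follow redirects by default.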
In the ZipRecruiter scraper, the HTML-scraping imports (the urllib.parse helpers, bs4.element.Tag, the unused Future) and the direct requests import are dropped, and the jobs import is collapsed to just the names still used:

@@ -9,25 +9,14 @@ import time
 import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
-from urllib.parse import urlparse, parse_qs, urlunparse
 
-import requests
 from bs4 import BeautifulSoup
-from bs4.element import Tag
-from concurrent.futures import ThreadPoolExecutor, Future
+from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ..utils import count_urgent_words, extract_emails_from_text, create_session
-from ...jobs import (
-    JobPost,
-    Compensation,
-    CompensationInterval,
-    Location,
-    JobResponse,
-    JobType,
-    Country,
-)
+from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
 
 
 class ZipRecruiterScraper(Scraper):
find_jobs_in_page drops the bare requests.get in favor of a session from create_session (plain requests here, no TLS client) and documents the continue token:

@@ -42,21 +31,22 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
 
-    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
+    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
+        :param continue_token:
         :return: jobs found on page
         """
         params = self.add_params(scraper_input)
         if continue_token:
             params['continue'] = continue_token
         try:
-            response = requests.get(
+            session = create_session(self.proxy, is_tls=False)
+            response = session.get(
                 f"https://api.ziprecruiter.com/jobs-app/jobs",
                 headers=self.headers(),
                 params=self.add_params(scraper_input),
-                allow_redirects=True,
                 timeout=10,
             )
             if response.status_code != 200:
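The continue token chains pages together; a hypothetical driving loop over the patched method might look like this (the loop and its stop condition are illustrative, not part of the diff):

    jobs, token = scraper.find_jobs_in_page(scraper_input)             # first page, no token
    while token and len(jobs) < 60:                                    # stop condition is illustrative
        page_jobs, token = scraper.find_jobs_in_page(scraper_input, token)
        jobs.extend(page_jobs)

Note, though, that the request above still passes params=self.add_params(scraper_input) rather than the local params dict that received the continue token, so as written the token set a few lines earlier never reaches the API.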
The worker count is tied to the page size instead of a hard-coded 10:

@@ -73,7 +63,7 @@ class ZipRecruiterScraper(Scraper):
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get('continue', None)
 
-        with ThreadPoolExecutor(max_workers=10) as executor:
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
             job_results = [
                 executor.submit(self.process_job, job)
                 for job in jobs_list
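executor.submit returns Future objects, so the comprehension above collects futures that still need resolving; a self-contained illustration of the pattern (the stand-in process_job and the sample data are made up):

    from concurrent.futures import ThreadPoolExecutor

    def process_job(job: dict) -> str:
        """Stand-in for ZipRecruiterScraper.process_job."""
        return job["name"]

    jobs_list = [{"name": "Data Engineer"}, {"name": "ML Engineer"}]
    with ThreadPoolExecutor(max_workers=len(jobs_list)) as executor:
        job_results = [executor.submit(process_job, job) for job in jobs_list]
        job_list = [future.result() for future in job_results]  # blocks until each future completes
    assert job_list == ["Data Engineer", "ML Engineer"]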
process_job becomes a static method with an accurate docstring:

@@ -109,12 +99,12 @@ class ZipRecruiterScraper(Scraper):
 
         return JobResponse(jobs=job_list)
 
-    def process_job(self, job: dict) -> JobPost:
-        """the most common type of jobs page on ZR"""
+    @staticmethod
+    def process_job(job: dict) -> JobPost:
+        """ Processes an individual job dict from the response """
         title = job.get("name")
         job_url = job.get("job_url")
 
         description = BeautifulSoup(
             job.get("job_description", "").strip(), "html.parser"
         ).get_text()
A stray space before a comma is removed in the compensation mapping:

@@ -144,7 +134,7 @@ class ZipRecruiterScraper(Scraper):
             location=location,
             job_type=job_type,
             compensation=Compensation(
-                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") ,
+                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
                 min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
                 max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
                 currency=job.get("compensation_currency"),
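As a worked example of the interval normalization on that line, an API job dict maps to the Compensation fields like so (the field names come from the hunk; the sample values are invented):

    job = {
        "compensation_interval": "annual",
        "compensation_min": 90000.0,
        "compensation_max": 120000.0,
        "compensation_currency": "USD",
    }
    interval = "yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval")
    assert interval == "yearly"
    assert int(job["compensation_min"]) == 90000   # min_amount
    assert int(job["compensation_max"]) == 120000  # max_amount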
Finally, the BeautifulSoup-era helpers (get_interval, get_date_posted, get_compensation, get_location), which parsed compensation, dates, and locations out of HTML Tag objects, are deleted now that the scraper reads these fields from the JSON API:

@@ -192,107 +182,6 @@ class ZipRecruiterScraper(Scraper):
 
         return params
 
-    @staticmethod
-    def get_interval(interval_str: str):
-        """
-        Maps the interval alias to its appropriate CompensationInterval.
-        :param interval_str
-        :return: CompensationInterval
-        """
-        interval_alias = {"annually": CompensationInterval.YEARLY}
-        interval_str = interval_str.lower()
-
-        if interval_str in interval_alias:
-            return interval_alias[interval_str]
-
-        return CompensationInterval(interval_str)
-
-    @staticmethod
-    def get_date_posted(job: Tag) -> Optional[datetime.date]:
-        """
-        Extracts the date a job was posted
-        :param job
-        :return: date the job was posted or None
-        """
-        button = job.find(
-            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
-        )
-        if not button:
-            return None
-
-        url_time = button.get("data-href", "")
-        url_components = urlparse(url_time)
-        params = parse_qs(url_components.query)
-        posted_time_str = params.get("posted_time", [None])[0]
-
-        if posted_time_str:
-            posted_date = datetime.strptime(
-                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
-            ).date()
-            return posted_date
-
-        return None
-
-    @staticmethod
-    def get_compensation(job: Tag) -> Optional[Compensation]:
-        """
-        Parses the compensation tag from the job BeautifulSoup object
-        :param job
-        :return: Compensation object or None
-        """
-        pay_element = job.find("li", {"class": "perk_item perk_pay"})
-        if pay_element is None:
-            return None
-        pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
-
-        def create_compensation_object(pay_string: str) -> Compensation:
-            """
-            Creates a Compensation object from a pay_string
-            :param pay_string
-            :return: compensation
-            """
-            interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
-
-            amounts = []
-            for amount in pay_string.split("to"):
-                amount = amount.replace(",", "").strip("$ ").split(" ")[0]
-                if "K" in amount:
-                    amount = amount.replace("K", "")
-                    amount = int(float(amount)) * 1000
-                else:
-                    amount = int(float(amount))
-                amounts.append(amount)
-
-            compensation = Compensation(
-                interval=interval,
-                min_amount=min(amounts),
-                max_amount=max(amounts),
-                currency="USD/CAD",
-            )
-
-            return compensation
-
-        return create_compensation_object(pay)
-
-    @staticmethod
-    def get_location(job: Tag) -> Location:
-        """
-        Extracts the job location from BeatifulSoup object
-        :param job:
-        :return: location
-        """
-        location_link = job.find("a", {"class": "company_location"})
-        if location_link is not None:
-            location_string = location_link.text.strip()
-            parts = location_string.split(", ")
-            if len(parts) == 2:
-                city, state = parts
-            else:
-                city, state = None, None
-        else:
-            city, state = None, None
-        return Location(city=city, state=state, country=Country.US_CANADA)
-
     @staticmethod
     def headers() -> dict:
         """