readd proxy support for zip (#64)

pull/66/head v1.1.22
Cullen Watson 2023-10-29 08:54:56 -05:00 committed by GitHub
parent b303b3f841
commit e3fc222eb5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 136 deletions

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.21" version = "1.1.22"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@ -58,7 +58,6 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country self.country = scraper_input.country
domain = self.country.domain_value domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com" self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
@ -78,6 +77,7 @@ class IndeedScraper(Scraper):
if sc_values: if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";" params["sc"] = "0kf:" + "".join(sc_values) + ";"
try: try:
session = create_session(self.proxy, is_tls=True)
response = session.get( response = session.get(
f"{self.url}/jobs", f"{self.url}/jobs",
headers=self.get_headers(), headers=self.get_headers(),

View File

@ -1,4 +1,6 @@
import re import re
import requests
import tls_client import tls_client
from ..jobs import JobType from ..jobs import JobType
@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text) return email_regex.findall(text)
def create_session(proxy: str | None = None): def create_session(proxy: dict | None = None, is_tls: bool = True):
""" """
Creates a tls client session Creates a tls client session
:return: A session object with or without proxies. :return: A session object with or without proxies.
""" """
session = tls_client.Session( if is_tls:
client_identifier="chrome112", session = tls_client.Session(
random_tls_extension_order=True, client_identifier="chrome112",
) random_tls_extension_order=True,
session.proxies = proxy )
# TODO multiple proxies session.proxies = proxy
# if self.proxies: # TODO multiple proxies
# session.proxies = { # if self.proxies:
# "http": random.choice(self.proxies), # session.proxies = {
# "https": random.choice(self.proxies), # "http": random.choice(self.proxies),
# } # "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
session.proxies.update(proxy)
return session return session

View File

@ -9,25 +9,14 @@ import time
import re import re
from datetime import datetime, date from datetime import datetime, date
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@ -42,21 +31,22 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]: def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
""" """
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:param continue_token:
:return: jobs found on page :return: jobs found on page
""" """
params = self.add_params(scraper_input) params = self.add_params(scraper_input)
if continue_token: if continue_token:
params['continue'] = continue_token params['continue'] = continue_token
try: try:
response = requests.get( session = create_session(self.proxy, is_tls=False)
response = session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs", f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(), headers=self.headers(),
params=self.add_params(scraper_input), params=self.add_params(scraper_input),
allow_redirects=True,
timeout=10, timeout=10,
) )
if response.status_code != 200: if response.status_code != 200:
@ -73,7 +63,7 @@ class ZipRecruiterScraper(Scraper):
jobs_list = response_data.get("jobs", []) jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get('continue', None) next_continue_token = response_data.get('continue', None)
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
job_results = [ job_results = [
executor.submit(self.process_job, job) executor.submit(self.process_job, job)
for job in jobs_list for job in jobs_list
@ -109,12 +99,12 @@ class ZipRecruiterScraper(Scraper):
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job(self, job: dict) -> JobPost: @staticmethod
"""the most common type of jobs page on ZR""" def process_job(job: dict) -> JobPost:
""" Processes an individual job dict from the response """
title = job.get("name") title = job.get("name")
job_url = job.get("job_url") job_url = job.get("job_url")
description = BeautifulSoup( description = BeautifulSoup(
job.get("job_description", "").strip(), "html.parser" job.get("job_description", "").strip(), "html.parser"
).get_text() ).get_text()
@ -144,7 +134,7 @@ class ZipRecruiterScraper(Scraper):
location=location, location=location,
job_type=job_type, job_type=job_type,
compensation=Compensation( compensation=Compensation(
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") , interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
min_amount=int(job["compensation_min"]) if "compensation_min" in job else None, min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
max_amount=int(job["compensation_max"]) if "compensation_max" in job else None, max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
currency=job.get("compensation_currency"), currency=job.get("compensation_currency"),
@ -192,107 +182,6 @@ class ZipRecruiterScraper(Scraper):
return params return params
@staticmethod
def get_interval(interval_str: str):
    """
    Resolves a pay-interval label to a CompensationInterval member.

    The label is lower-cased before lookup. "annually" is an alias for
    CompensationInterval.YEARLY; any other label is passed to the
    CompensationInterval constructor as-is (after lower-casing).

    :param interval_str: interval label, e.g. the last word of a pay string
    :return: CompensationInterval
    """
    aliases = {"annually": CompensationInterval.YEARLY}
    normalized = interval_str.lower()

    if normalized not in aliases:
        return CompensationInterval(normalized)
    return aliases[normalized]
@staticmethod
def get_date_posted(job: Tag) -> Optional[date]:
    """
    Extracts the date a job was posted.

    The date is read from the ``posted_time`` query parameter of the
    ``data-href`` attribute on the button whose class is
    ``action_input save_job zrs_btn_secondary_200``.

    :param job: job element to search (must expose ``find``)
    :return: date the job was posted, or None when the button, the
        ``posted_time`` parameter or its value is missing
    :raises ValueError: if ``posted_time`` is not formatted as
        ``%Y-%m-%dT%H:%M:%SZ``
    """
    button = job.find(
        "button", {"class": "action_input save_job zrs_btn_secondary_200"}
    )
    if not button:
        return None

    # The timestamp travels in the query string of the button's target URL.
    query = urlparse(button.get("data-href", "")).query
    posted_time_str = parse_qs(query).get("posted_time", [None])[0]
    if not posted_time_str:
        return None

    return datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()
@staticmethod
def get_compensation(job: Tag) -> Optional[Compensation]:
    """
    Parses the compensation tag from the job BeautifulSoup object.

    The pay text is taken from the ``span`` inside the ``div.value`` of the
    ``li`` with class ``perk_item perk_pay``. Its last word is the interval
    label; the amounts are separated by "to" and may carry a "$" prefix,
    thousands separators and a "K" suffix.

    :param job: job element to search
    :return: Compensation object, or None when the pay element, its
        value/span markup or its text is missing
    """
    pay_element = job.find("li", {"class": "perk_item perk_pay"})
    if pay_element is None:
        return None

    # Guard each lookup: a perk item without the expected value/span markup
    # would otherwise raise AttributeError instead of meaning "no pay data".
    value_element = pay_element.find("div", {"class": "value"})
    span = value_element.find("span") if value_element is not None else None
    if span is None:
        return None

    pay = span.text.strip()
    if not pay:
        # An empty string has no interval word to split off.
        return None

    def parse_amount(raw: str) -> int:
        """Converts one side of the pay range (e.g. "$52.5K") to an int."""
        token = raw.replace(",", "").strip("$ ").split(" ")[0]
        if "K" in token:
            # Scale before converting to int so "52.5K" gives 52500 rather
            # than 52000; round() absorbs float representation error.
            return round(float(token.replace("K", "")) * 1000)
        return int(float(token))

    def create_compensation_object(pay_string: str) -> Compensation:
        """
        Creates a Compensation object from a pay_string
        :param pay_string
        :return: compensation
        """
        interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
        amounts = [parse_amount(part) for part in pay_string.split("to")]
        return Compensation(
            interval=interval,
            min_amount=min(amounts),
            max_amount=max(amounts),
            currency="USD/CAD",
        )

    return create_compensation_object(pay)
@staticmethod
def get_location(job: Tag) -> Location:
    """
    Extracts the job location from a BeautifulSoup object.

    The text of the ``a`` element with class ``company_location`` is split
    on ", ". Exactly two parts are taken as city and state; any other
    shape, or a missing link, leaves both as None. The country is always
    Country.US_CANADA.

    :param job: job element to search
    :return: location
    """
    city = state = None

    location_link = job.find("a", {"class": "company_location"})
    if location_link is not None:
        pieces = location_link.text.strip().split(", ")
        if len(pieces) == 2:
            city, state = pieces

    return Location(city=city, state=state, country=Country.US_CANADA)
@staticmethod @staticmethod
def headers() -> dict: def headers() -> dict:
""" """