mirror of https://github.com/Bunsly/JobSpy
348 lines
10 KiB
Python
348 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from itertools import cycle
|
|
|
|
import numpy as np
|
|
import requests
|
|
import tls_client
|
|
import urllib3
|
|
from markdownify import markdownify as md
|
|
from requests.adapters import HTTPAdapter, Retry
|
|
|
|
from jobspy.model import CompensationInterval, JobType, Site
|
|
|
|
# Suppress urllib3's InsecureRequestWarning for the whole process. This runs
# as a side effect of importing this module.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
|
|
def create_logger(name: str):
    """
    Return the logger named ``JobSpy:<name>``, configuring it on first use.

    Propagation to ancestor loggers is switched off on every call. The INFO
    level and the console handler are only applied while the logger has no
    handlers, so calling this again with the same name neither stacks
    duplicate handlers nor resets a level that was changed afterwards.

    :param name: suffix appended to the ``JobSpy:`` prefix
    :return: the ``logging.Logger`` instance
    """
    log = logging.getLogger(f"JobSpy:{name}")
    log.propagate = False
    if log.handlers:
        # Already configured by an earlier call.
        return log

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    )
    log.setLevel(logging.INFO)
    log.addHandler(stream_handler)
    return log
|
|
|
|
|
|
class RotatingProxySession:
    """
    Mixin that stores an endless rotation of proxy mappings.

    ``proxy_cycle`` is an ``itertools.cycle`` over dictionaries of the form
    ``{"http": url, "https": url}``, or ``None`` when no proxy was supplied.
    This class only builds the cycle; it never advances it.
    """

    def __init__(self, proxies=None):
        """
        :param proxies: a single proxy string, a list or tuple of proxy
            strings, or ``None``. An empty sequence and any other type leave
            ``proxy_cycle`` set to ``None``.
        """
        if isinstance(proxies, str):
            proxies = [proxies]
        if isinstance(proxies, (list, tuple)) and proxies:
            self.proxy_cycle = cycle([self.format_proxy(proxy) for proxy in proxies])
        else:
            self.proxy_cycle = None

    @staticmethod
    def format_proxy(proxy):
        """
        Format a proxy string into a ``{"http": ..., "https": ...}`` dictionary.

        A proxy that already carries a scheme (``http://``, ``https://``,
        ``socks5://``, ...) is used unchanged for both keys. A bare
        ``host:port`` value is prefixed with ``http://``.

        :param proxy: proxy address, with or without a scheme
        :return: mapping with the same proxy URL under ``"http"`` and ``"https"``
        """
        # Only add a scheme when none is present; blindly prefixing would turn
        # "socks5://host:1080" into the invalid "http://socks5://host:1080".
        if "://" in proxy:
            return {"http": proxy, "https": proxy}
        prefixed = f"http://{proxy}"
        return {"http": prefixed, "https": prefixed}
|
|
|
|
|
|
class RequestsRotating(RotatingProxySession, requests.Session):
    """``requests.Session`` that selects the next proxy before every request."""

    def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
        """
        :param proxies: forwarded to ``RotatingProxySession.__init__``
        :param has_retry: when true, mount a retrying adapter (see ``setup_session``)
        :param delay: backoff factor handed to the retry policy
        :param clear_cookies: when true, empty the cookie jar before each request
        """
        RotatingProxySession.__init__(self, proxies=proxies)
        requests.Session.__init__(self)
        self.allow_redirects = True
        self.clear_cookies = clear_cookies
        self.setup_session(has_retry, delay)

    def setup_session(self, has_retry, delay):
        """
        Mount a retrying adapter on both URL schemes when ``has_retry`` is set.

        The policy allows 3 total / connect / status retries, retries on
        status codes 500, 502, 503, 504 and 429, and uses ``delay`` as the
        backoff factor. Nothing is mounted when ``has_retry`` is false.
        """
        if not has_retry:
            return
        retry_policy = Retry(
            total=3,
            connect=3,
            status=3,
            status_forcelist=[500, 502, 503, 504, 429],
            backoff_factor=delay,
        )
        retrying_adapter = HTTPAdapter(max_retries=retry_policy)
        for scheme in ("http://", "https://"):
            self.mount(scheme, retrying_adapter)

    def request(self, method, url, **kwargs):
        """
        Send a request after optionally clearing cookies and rotating the proxy.

        When a proxy rotation exists, the next entry replaces ``self.proxies``;
        an entry whose ``"http"`` URL is ``http://localhost`` resets
        ``self.proxies`` to an empty mapping instead.
        """
        if self.clear_cookies:
            self.cookies.clear()

        if self.proxy_cycle:
            candidate = next(self.proxy_cycle)
            use_no_proxy = candidate["http"] == "http://localhost"
            self.proxies = {} if use_no_proxy else candidate
        return requests.Session.request(self, method, url, **kwargs)
|
|
|
|
|
|
class TLSRotating(RotatingProxySession, tls_client.Session):
    """``tls_client.Session`` that selects the next proxy before every request."""

    def __init__(self, proxies=None):
        """
        :param proxies: forwarded to ``RotatingProxySession.__init__``
        """
        RotatingProxySession.__init__(self, proxies=proxies)
        tls_client.Session.__init__(self, random_tls_extension_order=True)

    def execute_request(self, *args, **kwargs):
        """
        Rotate the proxy, delegate to ``tls_client.Session.execute_request``,
        and tag the response with an ``ok`` flag.

        When a proxy rotation exists, the next entry replaces ``self.proxies``;
        an entry whose ``"http"`` URL is ``http://localhost`` resets
        ``self.proxies`` to an empty mapping instead.

        :return: the response, with ``ok`` set to whether its status code is
            in the range 200-399
        """
        if self.proxy_cycle:
            candidate = next(self.proxy_cycle)
            use_no_proxy = candidate["http"] == "http://localhost"
            self.proxies = {} if use_no_proxy else candidate
        result = tls_client.Session.execute_request(self, *args, **kwargs)
        result.ok = result.status_code in range(200, 400)
        return result
|
|
|
|
|
|
def create_session(
    *,
    proxies: dict | str | None = None,
    ca_cert: str | None = None,
    is_tls: bool = True,
    has_retry: bool = False,
    delay: int = 1,
    clear_cookies: bool = False,
) -> requests.Session:
    """
    Build a proxy-rotating session, TLS-client based or ``requests`` based.

    :param proxies: proxy specification handed to the session class
    :param ca_cert: when truthy, assigned to the session's ``verify`` attribute
    :param is_tls: choose ``TLSRotating`` (true) or ``RequestsRotating`` (false)
    :param has_retry: only used when ``is_tls`` is false
    :param delay: only used when ``is_tls`` is false
    :param clear_cookies: only used when ``is_tls`` is false
    :return: A session object
    """
    session = (
        TLSRotating(proxies=proxies)
        if is_tls
        else RequestsRotating(
            proxies=proxies,
            has_retry=has_retry,
            delay=delay,
            clear_cookies=clear_cookies,
        )
    )
    if ca_cert:
        session.verify = ca_cert
    return session
|
|
|
|
|
|
def set_logger_level(verbose: int):
    """
    Adjust the level of every existing ``JobSpy:`` logger at runtime.

    Only loggers already registered with the logging module whose name starts
    with ``JobSpy:`` are touched.

    Parameters:
    - verbose: 0 -> ERROR, 1 -> WARNING, 2 -> INFO (all logs). Any other
      non-None value falls back to INFO. None leaves every logger unchanged.
    """
    if verbose is None:
        return
    level = {2: logging.INFO, 1: logging.WARNING, 0: logging.ERROR}.get(
        verbose, logging.INFO
    )
    # Iterate over a snapshot: loggerDict is the logging module's live registry.
    for logger_name in list(logging.root.manager.loggerDict):
        if logger_name.startswith("JobSpy:"):
            logging.getLogger(logger_name).setLevel(level)
|
|
|
|
|
|
def markdown_converter(description_html: str):
    """
    Convert an HTML fragment to Markdown with surrounding whitespace removed.

    :param description_html: HTML text, or ``None``
    :return: the stripped Markdown string, or ``None`` when the input is ``None``
    """
    if description_html is None:
        return None
    return md(description_html).strip()
|
|
|
|
|
|
def extract_emails_from_text(text: str) -> list[str] | None:
    """
    Find every email-like substring in ``text``.

    :param text: text to scan
    :return: ``None`` for empty or missing text, otherwise the list of matches
        in order of appearance (empty when nothing matches)
    """
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
|
|
|
|
|
|
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Given a string, return the matching ``JobType`` member, or ``None``.

    A member matches when ``job_type_str in member.value`` is true. Every
    member is checked, and if several match, the last one in enum order is
    returned.
    """
    matching_members = [
        member for member in JobType if job_type_str in member.value
    ]
    return matching_members[-1] if matching_members else None
|
|
|
|
|
|
def currency_parser(cur_str):
    """
    Parse a currency string into a number rounded to two decimals.

    Accepts either ``.`` or ``,`` as the decimal mark. A separator only counts
    as a decimal mark when it sits within the last three characters; every
    separator before that is treated as a thousands separator and dropped.

    :param cur_str: text such as ``"$1,000.50"`` or ``"1.234,56 EUR"``
    :return: the value passed through ``np.round(..., 2)``
    :raises ValueError: when the cleaned text is not a valid float
    """
    # Keep only digits, '-', '.' and ','.
    cleaned = re.sub("[^-0-9.,]", "", cur_str)

    # Separators ahead of the last three characters are thousands separators.
    head, tail = cleaned[:-3], cleaned[-3:]
    number_text = re.sub("[.,]", "", head) + tail

    # A comma can only remain in the tail, where it is a decimal comma.
    # It is left alone if a dot is also present.
    if "." not in tail and "," in tail:
        number_text = number_text.replace(",", ".")

    return np.round(float(number_text), 2)
|
|
|
|
|
|
def remove_attributes(tag):
    """
    Delete every attribute of ``tag`` in place and return the same object.

    The attribute names are read from ``tag.attrs`` and each one is removed
    with ``del tag[name]``.
    """
    # Snapshot the names first so deleting cannot disturb the iteration.
    attribute_names = tuple(tag.attrs)
    for attribute_name in attribute_names:
        del tag[attribute_name]
    return tag
|
|
|
|
|
|
def extract_salary(
    salary_str,
    lower_limit=1000,
    upper_limit=700000,
    hourly_threshold=350,
    monthly_threshold=30000,
    enforce_annual_salary=False,
):
    """
    Extract a ``$min - $max`` salary range from free text.

    The first range found is parsed (a hyphen, en dash or em dash separates
    the bounds; the second ``$`` is optional). The pay interval is inferred
    from the minimum: below ``hourly_threshold`` it is hourly, below
    ``monthly_threshold`` it is monthly, otherwise yearly. Annual equivalents
    use 2080 hours or 12 months per year.

    :param salary_str: text that may contain a salary range
    :param lower_limit: smallest accepted annualised amount
    :param upper_limit: largest accepted annualised amount
    :param hourly_threshold: minimums below this are treated as hourly pay
    :param monthly_threshold: minimums below this are treated as monthly pay
    :param enforce_annual_salary: return annualised amounts instead of the
        amounts as written
    :return: ``(interval, min_amount, max_amount, "USD")``, or four ``None``
        values when nothing usable is found

    (TODO: Needs test cases as the regex is complicated and may not cover all edge cases)
    """
    nothing_found = (None, None, None, None)
    if not salary_str:
        return nothing_found

    range_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
    found = re.search(range_pattern, salary_str)
    if found is None:
        return nothing_found

    low_text, low_suffix, high_text, high_suffix = found.groups()
    low = int(float(low_text.replace(",", "")))
    high = int(float(high_text.replace(",", "")))

    # A 'k' suffix on either bound scales BOTH bounds by 1000.
    if "k" in low_suffix.lower() or "k" in high_suffix.lower():
        low *= 1000
        high *= 1000

    # The minimum decides the interval. The maximum is annualised only when it
    # falls under the same threshold; otherwise it stays None and the range is
    # rejected below.
    annual_high = None
    if low < hourly_threshold:
        interval = CompensationInterval.HOURLY.value
        annual_low = low * 2080
        if high < hourly_threshold:
            annual_high = high * 2080
    elif low < monthly_threshold:
        interval = CompensationInterval.MONTHLY.value
        annual_low = low * 12
        if high < monthly_threshold:
            annual_high = high * 12
    else:
        interval = CompensationInterval.YEARLY.value
        annual_low = low
        annual_high = high

    if not annual_high:
        return nothing_found

    # Both annualised bounds must lie inside the limits and form a real range.
    plausible = (
        lower_limit <= annual_low <= upper_limit
        and lower_limit <= annual_high <= upper_limit
        and annual_low < annual_high
    )
    if not plausible:
        return nothing_found

    if enforce_annual_salary:
        return interval, annual_low, annual_high, "USD"
    return interval, low, high, "USD"
|
|
|
|
|
|
def extract_job_type(description: str):
    """
    Detect job types mentioned in a job description.

    Matching is case-insensitive. "full time" and "part time" are recognised
    when written as one word, with a single whitespace character, or with a
    hyphen ("full-time").

    :param description: job description text
    :return: ``[]`` when ``description`` is empty or ``None``; otherwise the
        list of matching ``JobType`` members, or ``None`` when none match
    """
    if not description:
        return []

    # [\s-]? rather than \s? so the hyphenated spellings are matched as well.
    keywords = {
        JobType.FULL_TIME: r"full[\s-]?time",
        JobType.PART_TIME: r"part[\s-]?time",
        JobType.INTERNSHIP: r"internship",
        JobType.CONTRACT: r"contract",
    }

    listing_types = [
        job_type
        for job_type, pattern in keywords.items()
        if re.search(pattern, description, re.IGNORECASE)
    ]
    return listing_types or None
|
|
|
|
|
|
def map_str_to_site(site_name: str) -> Site:
    """
    Look up a ``Site`` member by name, ignoring the case of ``site_name``.

    :raises KeyError: when the upper-cased name is not a ``Site`` member name
    """
    member_name = site_name.upper()
    return Site[member_name]
|
|
|
|
|
|
def get_enum_from_value(value_str):
    """
    Return the first ``JobType`` member for which ``value_str in member.value``.

    :param value_str: value to look up
    :return: the first matching ``JobType`` member, in enum order
    :raises ValueError: when no member matches. ``ValueError`` is a subclass
        of ``Exception``, so handlers that catch ``Exception`` still work.
    """
    for job_type in JobType:
        if value_str in job_type.value:
            return job_type
    raise ValueError(f"Invalid job type: {value_str}")
|
|
|
|
|
|
def convert_to_annual(job_data: dict):
    """
    Rewrite ``job_data`` in place so its amounts are expressed per year.

    ``min_amount`` and ``max_amount`` are multiplied by 2080 (hourly),
    260 (daily), 52 (weekly) or 12 (monthly). For any other interval the
    amounts are left as they are. ``interval`` is always set to ``"yearly"``.

    :param job_data: mapping with ``interval``, ``min_amount`` and ``max_amount``
    :return: ``None``; the mapping is mutated
    """
    periods_per_year = {
        "hourly": 2080,
        "monthly": 12,
        "weekly": 52,
        "daily": 260,
    }
    factor = periods_per_year.get(job_data["interval"])
    if factor is not None:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"
|
|
|
|
|
|
# Job field names listed in a fixed order.
# NOTE(review): presumably the column order applied to the final output by
# callers; no use of this list is visible in this file, so confirm there.
desired_order = [
    "id",
    "site",
    "job_url",
    "job_url_direct",
    "title",
    "company",
    "location",
    "date_posted",
    "job_type",
    "salary_source",
    "interval",
    "min_amount",
    "max_amount",
    "currency",
    "is_remote",
    "job_level",
    "job_function",
    "listing_type",
    "emails",
    "description",
    "company_industry",
    "company_url",
    "company_logo",
    "company_url_direct",
    "company_addresses",
    "company_num_employees",
    "company_revenue",
    "company_description",
]
|