# Mirror of https://github.com/Bunsly/JobSpy
# 315 lines, 9.1 KiB, Python
|
from __future__ import annotations
|
|||
|
|
|||
|
from abc import ABC, abstractmethod
|
|||
|
from typing import Optional
|
|||
|
from datetime import date
|
|||
|
from enum import Enum
|
|||
|
from pydantic import BaseModel
|
|||
|
|
|||
|
|
|||
|
class JobType(Enum):
    """Employment types a job posting can be tagged with.

    Every member's value is a tuple of lowercase labels written without
    spaces. ``FULL_TIME``, ``PART_TIME`` and ``INTERNSHIP`` also carry labels
    in languages other than English.
    """

    # NOTE: a few labels occur twice ("vollzeit", "tempsplein"); the tuple is
    # kept exactly as it was, duplicates and ordering included.
    FULL_TIME = (
        "fulltime", "períodointegral", "estágio/trainee", "cunormăîntreagă",
        "tiempocompleto", "vollzeit", "voltijds", "tempointegral",
        "全职", "plnýúvazek", "fuldtid", "دوامكامل",
        "kokopäivätyö", "tempsplein", "vollzeit", "πλήρηςαπασχόληση",
        "teljesmunkaidő", "tempopieno", "tempsplein", "heltid",
        "jornadacompleta", "pełnyetat", "정규직", "100%",
        "全職", "งานประจำ", "tamzamanlı", "повназайнятість",
        "toànthờigian",
    )
    PART_TIME = (
        "parttime",
        "teilzeit",
        "částečnýúvazek",
        "deltid",
    )
    CONTRACT = (
        "contract",
        "contractor",
    )
    TEMPORARY = ("temporary",)
    INTERNSHIP = (
        "internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik",
    )

    # Members below carry a single English label each.
    PER_DIEM = ("perdiem",)
    NIGHTS = ("nights",)
    OTHER = ("other",)
    SUMMER = ("summer",)
    VOLUNTEER = ("volunteer",)
|
|||
|
|
|||
|
|
|||
|
class Country(Enum):
|
|||
|
"""
|
|||
|
Gets the subdomain for Indeed and Glassdoor.
|
|||
|
The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
|
|||
|
The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
|
|||
|
"""
|
|||
|
|
|||
|
ARGENTINA = ("argentina", "ar", "com.ar")
|
|||
|
AUSTRALIA = ("australia", "au", "com.au")
|
|||
|
AUSTRIA = ("austria", "at", "at")
|
|||
|
BAHRAIN = ("bahrain", "bh")
|
|||
|
BELGIUM = ("belgium", "be", "fr:be")
|
|||
|
BRAZIL = ("brazil", "br", "com.br")
|
|||
|
CANADA = ("canada", "ca", "ca")
|
|||
|
CHILE = ("chile", "cl")
|
|||
|
CHINA = ("china", "cn")
|
|||
|
COLOMBIA = ("colombia", "co")
|
|||
|
COSTARICA = ("costa rica", "cr")
|
|||
|
CZECHREPUBLIC = ("czech republic,czechia", "cz")
|
|||
|
DENMARK = ("denmark", "dk")
|
|||
|
ECUADOR = ("ecuador", "ec")
|
|||
|
EGYPT = ("egypt", "eg")
|
|||
|
FINLAND = ("finland", "fi")
|
|||
|
FRANCE = ("france", "fr", "fr")
|
|||
|
GERMANY = ("germany", "de", "de")
|
|||
|
GREECE = ("greece", "gr")
|
|||
|
HONGKONG = ("hong kong", "hk", "com.hk")
|
|||
|
HUNGARY = ("hungary", "hu")
|
|||
|
INDIA = ("india", "in", "co.in")
|
|||
|
INDONESIA = ("indonesia", "id")
|
|||
|
IRELAND = ("ireland", "ie", "ie")
|
|||
|
ISRAEL = ("israel", "il")
|
|||
|
ITALY = ("italy", "it", "it")
|
|||
|
JAPAN = ("japan", "jp")
|
|||
|
KUWAIT = ("kuwait", "kw")
|
|||
|
LUXEMBOURG = ("luxembourg", "lu")
|
|||
|
MALAYSIA = ("malaysia", "malaysia:my", "com")
|
|||
|
MALTA = ("malta", "malta:mt", "mt")
|
|||
|
MEXICO = ("mexico", "mx", "com.mx")
|
|||
|
MOROCCO = ("morocco", "ma")
|
|||
|
NETHERLANDS = ("netherlands", "nl", "nl")
|
|||
|
NEWZEALAND = ("new zealand", "nz", "co.nz")
|
|||
|
NIGERIA = ("nigeria", "ng")
|
|||
|
NORWAY = ("norway", "no")
|
|||
|
OMAN = ("oman", "om")
|
|||
|
PAKISTAN = ("pakistan", "pk")
|
|||
|
PANAMA = ("panama", "pa")
|
|||
|
PERU = ("peru", "pe")
|
|||
|
PHILIPPINES = ("philippines", "ph")
|
|||
|
POLAND = ("poland", "pl")
|
|||
|
PORTUGAL = ("portugal", "pt")
|
|||
|
QATAR = ("qatar", "qa")
|
|||
|
ROMANIA = ("romania", "ro")
|
|||
|
SAUDIARABIA = ("saudi arabia", "sa")
|
|||
|
SINGAPORE = ("singapore", "sg", "sg")
|
|||
|
SOUTHAFRICA = ("south africa", "za")
|
|||
|
SOUTHKOREA = ("south korea", "kr")
|
|||
|
SPAIN = ("spain", "es", "es")
|
|||
|
SWEDEN = ("sweden", "se")
|
|||
|
SWITZERLAND = ("switzerland", "ch", "de:ch")
|
|||
|
TAIWAN = ("taiwan", "tw")
|
|||
|
THAILAND = ("thailand", "th")
|
|||
|
TURKEY = ("türkiye,turkey", "tr")
|
|||
|
UKRAINE = ("ukraine", "ua")
|
|||
|
UNITEDARABEMIRATES = ("united arab emirates", "ae")
|
|||
|
UK = ("uk,united kingdom", "uk:gb", "co.uk")
|
|||
|
USA = ("usa,us,united states", "www:us", "com")
|
|||
|
URUGUAY = ("uruguay", "uy")
|
|||
|
VENEZUELA = ("venezuela", "ve")
|
|||
|
VIETNAM = ("vietnam", "vn", "com")
|
|||
|
|
|||
|
# internal for ziprecruiter
|
|||
|
US_CANADA = ("usa/ca", "www")
|
|||
|
|
|||
|
# internal for linkedin
|
|||
|
WORLDWIDE = ("worldwide", "www")
|
|||
|
|
|||
|
@property
|
|||
|
def indeed_domain_value(self):
|
|||
|
subdomain, _, api_country_code = self.value[1].partition(":")
|
|||
|
if subdomain and api_country_code:
|
|||
|
return subdomain, api_country_code.upper()
|
|||
|
return self.value[1], self.value[1].upper()
|
|||
|
|
|||
|
@property
|
|||
|
def glassdoor_domain_value(self):
|
|||
|
if len(self.value) == 3:
|
|||
|
subdomain, _, domain = self.value[2].partition(":")
|
|||
|
if subdomain and domain:
|
|||
|
return f"{subdomain}.glassdoor.{domain}"
|
|||
|
else:
|
|||
|
return f"www.glassdoor.{self.value[2]}"
|
|||
|
else:
|
|||
|
raise Exception(f"Glassdoor is not available for {self.name}")
|
|||
|
|
|||
|
def get_glassdoor_url(self):
|
|||
|
return f"https://{self.glassdoor_domain_value}/"
|
|||
|
|
|||
|
@classmethod
|
|||
|
def from_string(cls, country_str: str):
|
|||
|
"""Convert a string to the corresponding Country enum."""
|
|||
|
country_str = country_str.strip().lower()
|
|||
|
for country in cls:
|
|||
|
country_names = country.value[0].split(",")
|
|||
|
if country_str in country_names:
|
|||
|
return country
|
|||
|
valid_countries = [country.value for country in cls]
|
|||
|
raise ValueError(
|
|||
|
f"Invalid country string: '{country_str}'. Valid countries are: {', '.join([country[0] for country in valid_countries])}"
|
|||
|
)
|
|||
|
|
|||
|
|
|||
|
class Location(BaseModel):
    """Location of a job: optional city, state and country.

    ``country`` may be a :class:`Country` member or a free-form string.
    """

    country: Country | str | None = None
    city: Optional[str] = None
    state: Optional[str] = None

    def display_location(self) -> str:
        """Return the populated parts joined as ``"City, State, Country"``.

        Missing or empty parts are left out. A string ``country`` is used
        verbatim. A :class:`Country` member contributes the first of its
        names, upper-cased for ``usa``/``uk`` and title-cased otherwise.
        The internal ``US_CANADA`` and ``WORLDWIDE`` members are never shown.

        Returns:
            The joined location, or ``""`` when nothing is populated.
        """
        location_parts = [part for part in (self.city, self.state) if part]

        if isinstance(self.country, str):
            # An empty string would otherwise be joined in and leave a
            # dangling ", " at the end of the result (e.g. "Austin, ").
            if self.country:
                location_parts.append(self.country)
        elif self.country and self.country not in (
            Country.US_CANADA,
            Country.WORLDWIDE,
        ):
            # The first comma-separated entry is the primary name.
            country_name = self.country.value[0].split(",")[0]
            if country_name in ("usa", "uk"):
                location_parts.append(country_name.upper())
            else:
                location_parts.append(country_name.title())

        return ", ".join(location_parts)
|
|||
|
|
|||
|
|
|||
|
class CompensationInterval(Enum):
    """Pay periods a compensation figure can be quoted in."""

    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"

    @classmethod
    def get_interval(cls, pay_period):
        """Translate a pay-period label into the matching interval value.

        Accepts either a member name (``"YEARLY"``, ``"HOURLY"``, ...) or its
        short form (``"YEAR"``, ``"MONTH"``, ``"WEEK"``, ``"DAY"``,
        ``"HOUR"``). Matching ignores case and surrounding whitespace.

        Args:
            pay_period: The label to translate. Anything that is not a
                string is treated as unknown.

        Returns:
            The interval's string value (e.g. ``"yearly"``), or ``None``
            when the label is not recognised.
        """
        if not isinstance(pay_period, str):
            # Covers None and unhashable inputs, which cannot name an interval.
            return None

        key = pay_period.strip().upper()
        # Every member gets a short form, not only YEAR and HOUR.
        short_forms = {
            "YEAR": cls.YEARLY,
            "MONTH": cls.MONTHLY,
            "WEEK": cls.WEEKLY,
            "DAY": cls.DAILY,
            "HOUR": cls.HOURLY,
        }
        member = short_forms.get(key)
        if member is None:
            member = cls.__members__.get(key)
        return None if member is None else member.value
|
|||
|
|
|||
|
|
|||
|
class Compensation(BaseModel):
    """Pay information attached to a job posting.

    Every field is optional; ``currency`` falls back to ``"USD"``.
    """

    # Period the amounts below are quoted for.
    interval: Optional[CompensationInterval] = None

    # Lower and upper bounds of the pay range.
    min_amount: float | None = None
    max_amount: float | None = None

    # Currency code; defaults to "USD".
    currency: Optional[str] = "USD"
|
|||
|
|
|||
|
|
|||
|
class DescriptionFormat(Enum):
    """Output formats available for a job description."""

    MARKDOWN = "markdown"
    HTML = "html"
|
|||
|
|
|||
|
|
|||
|
class JobPost(BaseModel):
    """A single scraped job posting.

    ``title``, ``company_name``, ``job_url`` and ``location`` are declared
    without a default; every other field defaults to ``None``.
    """

    # --- core posting data ---
    id: str | None = None
    title: str
    company_name: str | None
    job_url: str
    job_url_direct: str | None = None
    location: Optional[Location]

    # --- description and company links ---
    description: str | None = None
    company_url: str | None = None
    company_url_direct: str | None = None

    # --- job attributes ---
    job_type: list[JobType] | None = None
    compensation: Compensation | None = None
    date_posted: date | None = None
    emails: list[str] | None = None
    is_remote: bool | None = None
    listing_type: str | None = None

    # LinkedIn specific
    job_level: str | None = None

    # LinkedIn and Indeed specific
    company_industry: str | None = None

    # Indeed specific
    company_addresses: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
    company_description: str | None = None
    company_logo: str | None = None
    banner_photo_url: str | None = None

    # LinkedIn only for now
    job_function: str | None = None
|
|||
|
|
|||
|
|
|||
|
class JobResponse(BaseModel):
    """Container for the job postings produced by a scrape."""

    # Collected postings; defaults to an empty list.
    jobs: list[JobPost] = []
|
|||
|
|
|||
|
|
|||
|
class Site(Enum):
    """Job sites known to the scrapers, keyed by their string identifier."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"
|
|||
|
|
|||
|
|
|||
|
class SalarySource(Enum):
    """Where a salary figure was taken from."""

    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"
|
|||
|
|
|||
|
|
|||
|
class ScraperInput(BaseModel):
    """Parameters handed to a scraper for one search.

    Only ``site_type`` is declared without a default.
    """

    # --- what to search for ---
    site_type: list[Site]
    search_term: str | None = None
    google_search_term: str | None = None

    # --- where and which kind of job ---
    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    offset: int = 0

    # --- LinkedIn options and output format ---
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    # --- result limits ---
    results_wanted: int = 15
    # NOTE(review): presumably the maximum posting age in hours; confirm with the scrapers.
    hours_old: int | None = None
|
|||
|
|
|||
|
|
|||
|
class Scraper(ABC):
    """Abstract base class for site scrapers.

    Stores the target site together with optional proxy and CA-certificate
    settings; concrete subclasses must implement :meth:`scrape`.
    """

    def __init__(
        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
    ):
        """Record the site and the optional connection settings on the instance."""
        self.site, self.proxies, self.ca_cert = site, proxies, ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run the search described by ``scraper_input``; subclasses provide the body."""
|