# Mirror of https://github.com/Bunsly/JobSpy
# 315 lines, 9.1 KiB, Python
from __future__ import annotations
|
||
|
||
from abc import ABC, abstractmethod
|
||
from typing import Optional
|
||
from datetime import date
|
||
from enum import Enum
|
||
from pydantic import BaseModel
|
||
|
||
|
||
class JobType(Enum):
    """
    Kinds of employment a job posting can advertise.

    Each member's value is a tuple of string labels for that job type.
    NOTE(review): the labels look like lower-cased, whitespace-stripped
    spellings in several languages so that scraped text can be matched with
    a simple ``label in member.value`` test -- confirm against the callers.
    """

    # NOTE(review): "vollzeit" and "tempsplein" each appear twice below. The
    # duplicates are harmless for membership tests and are kept so the member
    # value stays identical to what has been published.
    FULL_TIME = (
        "fulltime",
        "períodointegral",
        "estágio/trainee",
        "cunormăîntreagă",
        "tiempocompleto",
        "vollzeit",
        "voltijds",
        "tempointegral",
        "全职",
        "plnýúvazek",
        "fuldtid",
        "دوامكامل",
        "kokopäivätyö",
        "tempsplein",
        "vollzeit",
        "πλήρηςαπασχόληση",
        "teljesmunkaidő",
        "tempopieno",
        "tempsplein",
        "heltid",
        "jornadacompleta",
        "pełnyetat",
        "정규직",
        "100%",
        "全職",
        "งานประจำ",
        "tamzamanlı",
        "повназайнятість",
        "toànthờigian",
    )
    PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
    CONTRACT = ("contract", "contractor")
    TEMPORARY = ("temporary",)
    INTERNSHIP = (
        "internship",
        "prácticas",
        "ojt(onthejobtraining)",
        "praktikum",
        "praktik",
    )

    PER_DIEM = ("perdiem",)
    NIGHTS = ("nights",)
    OTHER = ("other",)
    SUMMER = ("summer",)
    VOLUNTEER = ("volunteer",)
|
||
|
||
|
||
class Country(Enum):
    """
    Gets the subdomain for Indeed and Glassdoor.

    Each member's value is a tuple:

    * The first item holds the lower-case name(s) accepted by
      ``from_string``; alternative names are separated by ','.
    * The second item is the subdomain (and API country code if there's a
      ':' separator) for Indeed.
    * The optional third item is the top-level domain for Glassdoor, or
      ``subdomain:tld`` if there's a ':' separator. Members without a third
      item have no Glassdoor site.
    """

    ARGENTINA = ("argentina", "ar", "com.ar")
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BELGIUM = ("belgium", "be", "fr:be")
    BRAZIL = ("brazil", "br", "com.br")
    CANADA = ("canada", "ca", "ca")
    CHILE = ("chile", "cl")
    CHINA = ("china", "cn")
    COLOMBIA = ("colombia", "co")
    COSTARICA = ("costa rica", "cr")
    CZECHREPUBLIC = ("czech republic,czechia", "cz")
    DENMARK = ("denmark", "dk")
    ECUADOR = ("ecuador", "ec")
    EGYPT = ("egypt", "eg")
    FINLAND = ("finland", "fi")
    FRANCE = ("france", "fr", "fr")
    GERMANY = ("germany", "de", "de")
    GREECE = ("greece", "gr")
    HONGKONG = ("hong kong", "hk", "com.hk")
    HUNGARY = ("hungary", "hu")
    INDIA = ("india", "in", "co.in")
    INDONESIA = ("indonesia", "id")
    IRELAND = ("ireland", "ie", "ie")
    ISRAEL = ("israel", "il")
    ITALY = ("italy", "it", "it")
    JAPAN = ("japan", "jp")
    KUWAIT = ("kuwait", "kw")
    LUXEMBOURG = ("luxembourg", "lu")
    MALAYSIA = ("malaysia", "malaysia:my", "com")
    MALTA = ("malta", "malta:mt", "mt")
    MEXICO = ("mexico", "mx", "com.mx")
    MOROCCO = ("morocco", "ma")
    NETHERLANDS = ("netherlands", "nl", "nl")
    NEWZEALAND = ("new zealand", "nz", "co.nz")
    NIGERIA = ("nigeria", "ng")
    NORWAY = ("norway", "no")
    OMAN = ("oman", "om")
    PAKISTAN = ("pakistan", "pk")
    PANAMA = ("panama", "pa")
    PERU = ("peru", "pe")
    PHILIPPINES = ("philippines", "ph")
    POLAND = ("poland", "pl")
    PORTUGAL = ("portugal", "pt")
    QATAR = ("qatar", "qa")
    ROMANIA = ("romania", "ro")
    SAUDIARABIA = ("saudi arabia", "sa")
    SINGAPORE = ("singapore", "sg", "sg")
    SOUTHAFRICA = ("south africa", "za")
    SOUTHKOREA = ("south korea", "kr")
    SPAIN = ("spain", "es", "es")
    SWEDEN = ("sweden", "se")
    SWITZERLAND = ("switzerland", "ch", "de:ch")
    TAIWAN = ("taiwan", "tw")
    THAILAND = ("thailand", "th")
    TURKEY = ("türkiye,turkey", "tr")
    UKRAINE = ("ukraine", "ua")
    UNITEDARABEMIRATES = ("united arab emirates", "ae")
    UK = ("uk,united kingdom", "uk:gb", "co.uk")
    USA = ("usa,us,united states", "www:us", "com")
    URUGUAY = ("uruguay", "uy")
    VENEZUELA = ("venezuela", "ve")
    VIETNAM = ("vietnam", "vn", "com")

    # internal for ziprecruiter
    US_CANADA = ("usa/ca", "www")

    # internal for linkedin
    WORLDWIDE = ("worldwide", "www")

    @property
    def indeed_domain_value(self) -> tuple[str, str]:
        """
        Return ``(subdomain, api_country_code)`` for Indeed.

        When the second tuple item has the form ``subdomain:code`` the two
        parts are returned separately, with the code upper-cased. Otherwise
        the item is used as the subdomain and its upper-cased form as the
        country code.
        """
        subdomain, _, api_country_code = self.value[1].partition(":")
        if subdomain and api_country_code:
            return subdomain, api_country_code.upper()
        return self.value[1], self.value[1].upper()

    @property
    def glassdoor_domain_value(self) -> str:
        """
        Return the Glassdoor host name for this country.

        Raises:
            ValueError: if the member has no Glassdoor entry (its value has
                no third item).
        """
        if len(self.value) != 3:
            # ValueError is still caught by callers using ``except Exception``.
            raise ValueError(f"Glassdoor is not available for {self.name}")

        subdomain, _, domain = self.value[2].partition(":")
        if subdomain and domain:
            return f"{subdomain}.glassdoor.{domain}"
        # No ':' separator: the whole item is the top-level domain.
        return f"www.glassdoor.{self.value[2]}"

    def get_glassdoor_url(self) -> str:
        """Return the https base URL (with trailing slash) of the Glassdoor site."""
        return f"https://{self.glassdoor_domain_value}/"

    @classmethod
    def from_string(cls, country_str: str) -> "Country":
        """
        Convert a string to the corresponding Country enum.

        The input is stripped and lower-cased, then compared with every
        comma-separated name in each member's first tuple item.

        Raises:
            ValueError: if no member lists the given name.
        """
        country_str = country_str.strip().lower()
        for country in cls:
            if country_str in country.value[0].split(","):
                return country

        valid_names = ", ".join(country.value[0] for country in cls)
        raise ValueError(
            f"Invalid country string: '{country_str}'. Valid countries are: {valid_names}"
        )
|
||
|
||
|
||
class Location(BaseModel):
    """A job location made of optional city, state and country parts."""

    # Either a Country member or a free-form country string.
    country: Country | str | None = None
    city: Optional[str] = None
    state: Optional[str] = None

    def display_location(self) -> str:
        """
        Return the location as a single ", "-joined string.

        Parts are emitted in city, state, country order and empty parts are
        skipped. A string country is used verbatim. A Country member is
        rendered from the first of its comma-separated names: "usa" and "uk"
        are upper-cased, any other name is title-cased. The internal
        US_CANADA and WORLDWIDE members contribute nothing.
        """
        location_parts = []
        if self.city:
            location_parts.append(self.city)
        if self.state:
            location_parts.append(self.state)

        if isinstance(self.country, str):
            location_parts.append(self.country)
        elif self.country and self.country not in (
            Country.US_CANADA,
            Country.WORLDWIDE,
        ):
            # The first tuple item may list several aliases; show the first.
            country_name = self.country.value[0].split(",")[0]
            if country_name in ("usa", "uk"):
                location_parts.append(country_name.upper())
            else:
                location_parts.append(country_name.title())

        return ", ".join(location_parts)
|
||
|
||
|
||
class CompensationInterval(Enum):
    """Pay periods that a compensation amount can refer to."""

    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"

    @classmethod
    def get_interval(cls, pay_period: Optional[str]) -> Optional[str]:
        """
        Map a pay-period key to the matching interval's string value.

        ``"YEAR"`` and ``"HOUR"`` are accepted as aliases for YEARLY and
        HOURLY; any other key must be an exact (case-sensitive) member name
        such as ``"MONTHLY"``.

        Returns:
            The member's value (for example ``"yearly"``), or ``None`` when
            the key matches neither an alias nor a member name.
        """
        # Aliases that are not member names themselves.
        interval_mapping = {
            "YEAR": cls.YEARLY,
            "HOUR": cls.HOURLY,
        }
        if pay_period in interval_mapping:
            return interval_mapping[pay_period].value

        member = cls.__members__.get(pay_period)
        return member.value if member is not None else None
|
||
|
||
|
||
class Compensation(BaseModel):
    """Pay details for a job: an optional interval, amount range and currency."""

    interval: Optional[CompensationInterval] = None
    min_amount: float | None = None
    max_amount: float | None = None
    # Currency defaults to "USD" when not supplied.
    currency: Optional[str] = "USD"
|
||
|
||
|
||
class DescriptionFormat(Enum):
    """Output formats selectable for a job description."""

    MARKDOWN = "markdown"
    HTML = "html"
|
||
|
||
|
||
class JobPost(BaseModel):
    """
    A single scraped job posting.

    ``title``, ``company_name``, ``job_url`` and ``location`` have no default
    and must be supplied (``company_name`` and ``location`` may be ``None``);
    every other field defaults to ``None``.
    """

    id: str | None = None
    title: str
    company_name: str | None
    job_url: str
    job_url_direct: str | None = None
    location: Optional[Location]

    description: str | None = None
    company_url: str | None = None
    company_url_direct: str | None = None

    job_type: list[JobType] | None = None
    compensation: Compensation | None = None
    date_posted: date | None = None
    emails: list[str] | None = None
    is_remote: bool | None = None
    listing_type: str | None = None

    # linkedin specific
    job_level: str | None = None

    # linkedin and indeed specific
    company_industry: str | None = None

    # indeed specific
    company_addresses: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
    company_description: str | None = None
    company_logo: str | None = None
    banner_photo_url: str | None = None

    # linkedin only atm
    job_function: str | None = None
|
||
|
||
|
||
class JobResponse(BaseModel):
    """Container for the job postings returned by a scraper."""

    # A literal [] default is safe on a pydantic model: pydantic copies
    # mutable defaults per instance rather than sharing one list.
    jobs: list[JobPost] = []
|
||
|
||
|
||
class Site(Enum):
    """Job boards that can be selected for scraping."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"
|
||
|
||
|
||
class SalarySource(Enum):
    """Possible origins of a salary figure."""

    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"
|
||
|
||
|
||
class ScraperInput(BaseModel):
    """
    Search parameters passed to ``Scraper.scrape``.

    Only ``site_type`` is required; every other field has a default.
    """

    site_type: list[Site]
    search_term: str | None = None
    google_search_term: str | None = None

    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    offset: int = 0
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    results_wanted: int = 15
    hours_old: int | None = None
|
||
|
||
|
||
class Scraper(ABC):
    """
    Abstract base class for site scrapers.

    Subclasses must implement ``scrape``; the base class only stores the
    constructor arguments on the instance.
    """

    def __init__(
        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
    ):
        """
        Store the scraper configuration.

        Args:
            site: The site this scraper targets.
            proxies: Optional list of proxy strings, stored as given.
            ca_cert: Optional CA certificate string, stored as given.
        """
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run a search described by ``scraper_input`` and return the jobs found."""
        ...
|