JobSpy/jobspy/model.py

315 lines
9.1 KiB
Python
Raw Normal View History

from __future__ import annotations
2025-02-21 12:14:55 -08:00
from abc import ABC, abstractmethod
2024-01-28 19:50:41 -08:00
from typing import Optional
2023-08-31 08:29:43 -07:00
from datetime import date
2023-07-06 16:44:38 -07:00
from enum import Enum
2024-01-28 19:50:41 -08:00
from pydantic import BaseModel
2023-07-11 06:24:59 -07:00
2023-07-06 16:44:38 -07:00
class JobType(Enum):
    """Employment types a job posting can be tagged with.

    Each member's value is a tuple of lowercase labels written without
    spaces. The first label is the English name; any further labels are
    spellings of the same type in other languages. Labels are unique within
    a tuple so a membership test behaves the same as before while the tuple
    stays free of redundant entries.
    """

    FULL_TIME = (
        "fulltime",
        "períodointegral",
        # NOTE(review): this label reads as "internship/trainee" in
        # Portuguese yet is listed under full-time; kept as found - confirm.
        "estágio/trainee",
        "cunormăîntreagă",
        "tiempocompleto",
        "vollzeit",
        "voltijds",
        "tempointegral",
        "全职",
        "plnýúvazek",
        "fuldtid",
        "دوامكامل",
        "kokopäivätyö",
        "tempsplein",
        "πλήρηςαπασχόληση",
        "teljesmunkaidő",
        "tempopieno",
        "heltid",
        "jornadacompleta",
        "pełnyetat",
        "정규직",
        "100%",
        "全職",
        "งานประจำ",
        "tamzamanlı",
        "повназайнятість",
        "toànthờigian",
    )
    PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
    CONTRACT = ("contract", "contractor")
    TEMPORARY = ("temporary",)
    INTERNSHIP = (
        "internship",
        "prácticas",
        "ojt(onthejobtraining)",
        "praktikum",
        "praktik",
    )
    PER_DIEM = ("perdiem",)
    NIGHTS = ("nights",)
    OTHER = ("other",)
    SUMMER = ("summer",)
    VOLUNTEER = ("volunteer",)
class Country(Enum):
    """
    Countries known to the scrapers, with the Indeed and Glassdoor domain parts.

    The first item in the tuple is a comma-separated list of accepted names
    The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
    The third item in the tuple, when present, is the subdomain (and tld if there's a ':' separator) for Glassdoor
    """

    ARGENTINA = ("argentina", "ar", "com.ar")
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BELGIUM = ("belgium", "be", "fr:be")
    BRAZIL = ("brazil", "br", "com.br")
    CANADA = ("canada", "ca", "ca")
    CHILE = ("chile", "cl")
    CHINA = ("china", "cn")
    COLOMBIA = ("colombia", "co")
    COSTARICA = ("costa rica", "cr")
    CZECHREPUBLIC = ("czech republic,czechia", "cz")
    DENMARK = ("denmark", "dk")
    ECUADOR = ("ecuador", "ec")
    EGYPT = ("egypt", "eg")
    FINLAND = ("finland", "fi")
    FRANCE = ("france", "fr", "fr")
    GERMANY = ("germany", "de", "de")
    GREECE = ("greece", "gr")
    HONGKONG = ("hong kong", "hk", "com.hk")
    HUNGARY = ("hungary", "hu")
    INDIA = ("india", "in", "co.in")
    INDONESIA = ("indonesia", "id")
    IRELAND = ("ireland", "ie", "ie")
    ISRAEL = ("israel", "il")
    ITALY = ("italy", "it", "it")
    JAPAN = ("japan", "jp")
    KUWAIT = ("kuwait", "kw")
    LUXEMBOURG = ("luxembourg", "lu")
    MALAYSIA = ("malaysia", "malaysia:my", "com")
    MALTA = ("malta", "malta:mt", "mt")
    MEXICO = ("mexico", "mx", "com.mx")
    MOROCCO = ("morocco", "ma")
    NETHERLANDS = ("netherlands", "nl", "nl")
    NEWZEALAND = ("new zealand", "nz", "co.nz")
    NIGERIA = ("nigeria", "ng")
    NORWAY = ("norway", "no")
    OMAN = ("oman", "om")
    PAKISTAN = ("pakistan", "pk")
    PANAMA = ("panama", "pa")
    PERU = ("peru", "pe")
    PHILIPPINES = ("philippines", "ph")
    POLAND = ("poland", "pl")
    PORTUGAL = ("portugal", "pt")
    QATAR = ("qatar", "qa")
    ROMANIA = ("romania", "ro")
    SAUDIARABIA = ("saudi arabia", "sa")
    SINGAPORE = ("singapore", "sg", "sg")
    SOUTHAFRICA = ("south africa", "za")
    SOUTHKOREA = ("south korea", "kr")
    SPAIN = ("spain", "es", "es")
    SWEDEN = ("sweden", "se")
    SWITZERLAND = ("switzerland", "ch", "de:ch")
    TAIWAN = ("taiwan", "tw")
    THAILAND = ("thailand", "th")
    TURKEY = ("türkiye,turkey", "tr")
    UKRAINE = ("ukraine", "ua")
    UNITEDARABEMIRATES = ("united arab emirates", "ae")
    UK = ("uk,united kingdom", "uk:gb", "co.uk")
    USA = ("usa,us,united states", "www:us", "com")
    URUGUAY = ("uruguay", "uy")
    VENEZUELA = ("venezuela", "ve")
    VIETNAM = ("vietnam", "vn", "com")

    # internal for ziprecruiter
    US_CANADA = ("usa/ca", "www")

    # internal for linkedin
    WORLDWIDE = ("worldwide", "www")

    @property
    def indeed_domain_value(self) -> tuple[str, str]:
        """Return ``(subdomain, upper-cased API country code)`` for Indeed.

        When the second tuple item has the form ``"sub:code"`` the two halves
        are returned separately; otherwise the item serves as both.
        """
        subdomain, _, api_country_code = self.value[1].partition(":")
        if subdomain and api_country_code:
            return subdomain, api_country_code.upper()
        return self.value[1], self.value[1].upper()

    @property
    def glassdoor_domain_value(self) -> str:
        """Return the Glassdoor host name for this country.

        Raises:
            ValueError: if the member has no third (Glassdoor) tuple item.
        """
        if len(self.value) != 3:
            # ValueError is still caught by callers that use `except Exception`.
            raise ValueError(f"Glassdoor is not available for {self.name}")
        subdomain, _, tld = self.value[2].partition(":")
        if subdomain and tld:
            return f"{subdomain}.glassdoor.{tld}"
        return f"www.glassdoor.{self.value[2]}"

    def get_glassdoor_url(self) -> str:
        """Return the ``https://`` base URL built from the Glassdoor host name."""
        return f"https://{self.glassdoor_domain_value}/"

    @classmethod
    def from_string(cls, country_str: str) -> Country:
        """Convert a string to the corresponding Country enum.

        Matching ignores surrounding whitespace and letter case, and accepts
        any of the comma-separated names in a member's first tuple item.

        Raises:
            ValueError: if no member lists the given name.
        """
        country_str = country_str.strip().lower()
        for country in cls:
            if country_str in country.value[0].split(","):
                return country
        valid_names = ", ".join(country.value[0] for country in cls)
        raise ValueError(
            f"Invalid country string: '{country_str}'. Valid countries are: {valid_names}"
        )
2023-07-06 16:44:38 -07:00
class Location(BaseModel):
    """Location of a job posting; every component is optional."""

    country: Country | str | None = None
    city: str | None = None
    state: str | None = None

    def display_location(self) -> str:
        """Return the non-empty parts as ``"city, state, country"``.

        A string country is appended verbatim. A ``Country`` member is
        rendered from its first listed name: upper-cased for ``usa`` and
        ``uk``, title-cased otherwise. The internal ``US_CANADA`` and
        ``WORLDWIDE`` members are left out.
        """
        pieces = [piece for piece in (self.city, self.state) if piece]
        country = self.country

        if isinstance(country, str):
            pieces.append(country)
            return ", ".join(pieces)

        hidden = (Country.US_CANADA, Country.WORLDWIDE)
        if not country or country in hidden:
            return ", ".join(pieces)

        # Only the first of the comma-separated names is shown.
        primary_name = country.value[0].split(",")[0]
        is_acronym = primary_name in ("usa", "uk")
        pieces.append(primary_name.upper() if is_acronym else primary_name.title())
        return ", ".join(pieces)
2023-07-06 16:44:38 -07:00
2023-07-08 04:57:36 -07:00
class CompensationInterval(Enum):
    """Pay period over which a compensation amount is quoted."""

    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"

    @classmethod
    def get_interval(cls, pay_period):
        """Translate a pay-period key into an interval value string.

        ``"YEAR"`` and ``"HOUR"`` are accepted as shorthands for ``YEARLY``
        and ``HOURLY``; any other key must be an exact member name.

        Returns the member's value (for example ``"yearly"``), or ``None``
        when the key matches nothing.
        """
        shorthands = {"YEAR": cls.YEARLY, "HOUR": cls.HOURLY}
        member = shorthands.get(pay_period)
        if member is None:
            # Fall back to an exact, case-sensitive member-name lookup.
            member = cls.__members__.get(pay_period)
        return None if member is None else member.value
2023-12-02 00:42:54 -08:00
2023-07-08 04:57:36 -07:00
2023-07-06 16:44:38 -07:00
class Compensation(BaseModel):
    """Pay details for a job posting: interval, amount range and currency."""

    interval: CompensationInterval | None = None
    min_amount: float | None = None
    max_amount: float | None = None
    # The currency code falls back to "USD" when none is supplied.
    currency: str | None = "USD"
2023-07-06 16:44:38 -07:00
2024-02-14 14:04:23 -08:00
class DescriptionFormat(Enum):
    """Markup formats a job description can be delivered in."""

    MARKDOWN = "markdown"
    HTML = "html"
2023-07-06 16:44:38 -07:00
class JobPost(BaseModel):
    """A single job posting as collected from a job site.

    ``title`` and ``job_url`` are plain required strings. ``company_name``
    and ``location`` accept ``None`` but are declared without a default.
    All remaining fields default to ``None``.
    """

    id: str | None = None
    title: str
    company_name: str | None
    job_url: str
    job_url_direct: str | None = None
    location: Location | None

    description: str | None = None
    company_url: str | None = None
    company_url_direct: str | None = None

    job_type: list[JobType] | None = None
    compensation: Compensation | None = None
    date_posted: date | None = None
    emails: list[str] | None = None
    is_remote: bool | None = None
    listing_type: str | None = None

    # linkedin specific
    job_level: str | None = None

    # linkedin and indeed specific
    company_industry: str | None = None

    # indeed specific
    company_addresses: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
    company_description: str | None = None
    company_logo: str | None = None
    banner_photo_url: str | None = None

    # linkedin only atm
    job_function: str | None = None
2023-07-06 17:12:01 -07:00
class JobResponse(BaseModel):
    """Container for the job postings returned by a scrape."""

    # NOTE(review): a literal list default is safe only because pydantic
    # copies field defaults per instance - confirm for the pinned version.
    jobs: list[JobPost] = []
2025-02-21 12:14:55 -08:00
class Site(Enum):
    """Identifiers of the job sites a scraper can target."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"
class SalarySource(Enum):
    """Origin of a salary figure.

    NOTE(review): the member names suggest structured site data versus text
    parsed from the description; the usage is not visible in this file.
    """

    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"
class ScraperInput(BaseModel):
    """Parameters of one scrape request, as handed to ``Scraper.scrape``.

    Only ``site_type`` is required; every other field has a default.
    """

    site_type: list[Site]

    # What to search for and where.
    search_term: str | None = None
    google_search_term: str | None = None
    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None

    # Filters.
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    offset: int = 0

    # LinkedIn-specific switches.
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None

    # Output shaping.
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
    results_wanted: int = 15
    hours_old: int | None = None
class Scraper(ABC):
    """Abstract base class for site scrapers.

    Stores the target site together with optional proxy and CA-certificate
    settings; concrete subclasses provide ``scrape``.
    """

    def __init__(
        self,
        site: Site,
        proxies: list[str] | None = None,
        ca_cert: str | None = None,
    ):
        """Record the site and the optional connection settings."""
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run the scrape described by *scraper_input* and return the jobs."""