FEAT: Optional conversion to annual salary and expose the salary source (#170)

pull/173/head
Lluís Salord Quetglas 2024-07-18 04:05:33 +02:00 committed by GitHub
parent de70189fa2
commit 2a0cba8c7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 30 additions and 12 deletions

View File

@ -10,7 +10,7 @@ from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import ( from .scrapers.exceptions import (
LinkedInException, LinkedInException,
IndeedException, IndeedException,
@ -36,6 +36,7 @@ def scrape_jobs(
linkedin_company_ids: list[int] | None = None, linkedin_company_ids: list[int] | None = None,
offset: int | None = 0, offset: int | None = 0,
hours_old: int = None, hours_old: int = None,
enforce_annual_salary: bool = True,
verbose: int = 2, verbose: int = 2,
**kwargs, **kwargs,
) -> pd.DataFrame: ) -> pd.DataFrame:
@ -165,7 +166,8 @@ def scrape_jobs(
job_data["min_amount"] = compensation_obj.get("min_amount") job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount") job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD") job_data["currency"] = compensation_obj.get("currency", "USD")
if ( job_data["salary_source"] = SalarySource.DIRECT_DATA.value
if enforce_annual_salary and (
job_data["interval"] job_data["interval"]
and job_data["interval"] != "yearly" and job_data["interval"] != "yearly"
and job_data["min_amount"] and job_data["min_amount"]
@ -180,7 +182,9 @@ def scrape_jobs(
job_data["min_amount"], job_data["min_amount"],
job_data["max_amount"], job_data["max_amount"],
job_data["currency"], job_data["currency"],
) = extract_salary(job_data["description"]) ) = extract_salary(job_data["description"], enforce_annual_salary=enforce_annual_salary)
job_data["salary_source"] = SalarySource.DESCRIPTION.value
job_df = pd.DataFrame([job_data]) job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df) jobs_dfs.append(job_df)

View File

@ -18,6 +18,9 @@ class Site(Enum):
ZIP_RECRUITER = "zip_recruiter" ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor" GLASSDOOR = "glassdoor"
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
site_type: list[Site] site_type: list[Site]

View File

@ -10,7 +10,7 @@ import numpy as np
from markdownify import markdownify as md from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType from ..jobs import CompensationInterval, JobType
logger = logging.getLogger("JobSpy") logger = logging.getLogger("JobSpy")
logger.propagate = False logger.propagate = False
@ -193,6 +193,7 @@ def extract_salary(
upper_limit=700000, upper_limit=700000,
hourly_threshold=350, hourly_threshold=350,
monthly_threshold=30000, monthly_threshold=30000,
enforce_annual_salary=False,
): ):
if not salary_str: if not salary_str:
return None, None, None, None return None, None, None, None
@ -220,20 +221,30 @@ def extract_salary(
# Convert to annual if less than the hourly threshold # Convert to annual if less than the hourly threshold
if min_salary < hourly_threshold: if min_salary < hourly_threshold:
min_salary = convert_hourly_to_annual(min_salary) interval = CompensationInterval.HOURLY.value
annual_min_salary = convert_hourly_to_annual(min_salary)
if max_salary < hourly_threshold: if max_salary < hourly_threshold:
max_salary = convert_hourly_to_annual(max_salary) annual_max_salary = convert_hourly_to_annual(max_salary)
elif min_salary < monthly_threshold: elif min_salary < monthly_threshold:
min_salary = convert_monthly_to_annual(min_salary) interval = CompensationInterval.MONTHLY.value
annual_min_salary = convert_monthly_to_annual(min_salary)
if max_salary < monthly_threshold: if max_salary < monthly_threshold:
max_salary = convert_monthly_to_annual(max_salary) annual_max_salary = convert_monthly_to_annual(max_salary)
else:
interval = CompensationInterval.YEARLY.value
annual_min_salary = min_salary
annual_max_salary = max_salary
# Ensure salary range is within specified limits # Ensure salary range is within specified limits
if ( if (
lower_limit <= min_salary <= upper_limit lower_limit <= annual_min_salary <= upper_limit
and lower_limit <= max_salary <= upper_limit and lower_limit <= annual_max_salary <= upper_limit
and min_salary < max_salary and annual_min_salary < annual_max_salary
): ):
return "yearly", min_salary, max_salary, "USD" if enforce_annual_salary:
return interval, annual_min_salary, annual_max_salary, "USD"
else:
return interval, min_salary, max_salary, "USD"
return None, None, None, None return None, None, None, None