From 2a0cba8c7ec4a4cd4727474c33395ef4417c55b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Llu=C3=ADs=20Salord=20Quetglas?= Date: Thu, 18 Jul 2024 04:05:33 +0200 Subject: [PATCH] FEAT: Optional conversion to annual and know salary source (#170) --- src/jobspy/__init__.py | 10 +++++++--- src/jobspy/scrapers/__init__.py | 3 +++ src/jobspy/scrapers/utils.py | 29 ++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 3e69a60..35399a3 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -10,7 +10,7 @@ from .scrapers.indeed import IndeedScraper from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.glassdoor import GlassdoorScraper from .scrapers.linkedin import LinkedInScraper -from .scrapers import ScraperInput, Site, JobResponse, Country +from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country from .scrapers.exceptions import ( LinkedInException, IndeedException, @@ -36,6 +36,7 @@ def scrape_jobs( linkedin_company_ids: list[int] | None = None, offset: int | None = 0, hours_old: int = None, + enforce_annual_salary: bool = True, verbose: int = 2, **kwargs, ) -> pd.DataFrame: @@ -165,7 +166,8 @@ def scrape_jobs( job_data["min_amount"] = compensation_obj.get("min_amount") job_data["max_amount"] = compensation_obj.get("max_amount") job_data["currency"] = compensation_obj.get("currency", "USD") - if ( + job_data["salary_source"] = SalarySource.DIRECT_DATA.value + if enforce_annual_salary and ( job_data["interval"] and job_data["interval"] != "yearly" and job_data["min_amount"] @@ -180,7 +182,9 @@ def scrape_jobs( job_data["min_amount"], job_data["max_amount"], job_data["currency"], - ) = extract_salary(job_data["description"]) + ) = extract_salary(job_data["description"], enforce_annual_salary=enforce_annual_salary) + job_data["salary_source"] = SalarySource.DESCRIPTION.value + job_df = pd.DataFrame([job_data]) jobs_dfs.append(job_df) 
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index af278d7..3f9ab51 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -18,6 +18,9 @@ class Site(Enum): ZIP_RECRUITER = "zip_recruiter" GLASSDOOR = "glassdoor" +class SalarySource(Enum): + DIRECT_DATA = "direct_data" + DESCRIPTION = "description" class ScraperInput(BaseModel): site_type: list[Site] diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 16607b1..56b8bac 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -10,7 +10,7 @@ import numpy as np from markdownify import markdownify as md from requests.adapters import HTTPAdapter, Retry -from ..jobs import JobType +from ..jobs import CompensationInterval, JobType logger = logging.getLogger("JobSpy") logger.propagate = False @@ -193,6 +193,7 @@ def extract_salary( upper_limit=700000, hourly_threshold=350, monthly_threshold=30000, + enforce_annual_salary=False, ): if not salary_str: return None, None, None, None @@ -220,20 +221,30 @@ def extract_salary( # Convert to annual if less than the hourly threshold if min_salary < hourly_threshold: - min_salary = convert_hourly_to_annual(min_salary) + interval = CompensationInterval.HOURLY.value + annual_min_salary = convert_hourly_to_annual(min_salary) if max_salary < hourly_threshold: - max_salary = convert_hourly_to_annual(max_salary) + annual_max_salary = convert_hourly_to_annual(max_salary) elif min_salary < monthly_threshold: - min_salary = convert_monthly_to_annual(min_salary) + interval = CompensationInterval.MONTHLY.value + annual_min_salary = convert_monthly_to_annual(min_salary) if max_salary < monthly_threshold: - max_salary = convert_monthly_to_annual(max_salary) + annual_max_salary = convert_monthly_to_annual(max_salary) + + else: + interval = CompensationInterval.YEARLY.value + annual_min_salary = min_salary + annual_max_salary = max_salary # Ensure salary range is within specified limits 
if ( - lower_limit <= min_salary <= upper_limit - and lower_limit <= max_salary <= upper_limit - and min_salary < max_salary + lower_limit <= annual_min_salary <= upper_limit + and lower_limit <= annual_max_salary <= upper_limit + and annual_min_salary < annual_max_salary ): - return "yearly", min_salary, max_salary, "USD" + if enforce_annual_salary: + return interval, annual_min_salary, annual_max_salary, "USD" + else: + return interval, min_salary, max_salary, "USD" return None, None, None, None