# JobSpy/jobspy/__init__.py

from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
    set_logger_level,
    extract_salary,
    create_logger,
    get_enum_from_value,
    map_str_to_site,
    convert_to_annual,
    desired_order,
)
from jobspy.ziprecruiter import ZipRecruiter


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    google_search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",
    linkedin_fetch_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    enforce_annual_salary: bool = False,
    verbose: int = 0,
    **kwargs,
) -> pd.DataFrame:
    """
    Scrapes job data from job boards concurrently.

    :return: pandas DataFrame containing job data
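
    Example (an illustrative sketch; the argument values below are
    assumptions, not defaults):

        jobs = scrape_jobs(
            site_name=["indeed", "linkedin"],
            search_term="software engineer",
            location="San Francisco, CA",
            results_wanted=20,
        )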
"""
    SCRAPER_MAPPING = {
        Site.LINKEDIN: LinkedIn,
        Site.INDEED: Indeed,
        Site.ZIP_RECRUITER: ZipRecruiter,
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None

    def get_site_type():
        site_types = list(Site)
        if isinstance(site_name, str):
            site_types = [map_str_to_site(site_name)]
        elif isinstance(site_name, Site):
            site_types = [site_name]
        elif isinstance(site_name, list):
            site_types = [
                map_str_to_site(site) if isinstance(site, str) else site
                for site in site_name
            ]
        return site_types

    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=get_site_type(),
        country=country_enum,
        search_term=search_term,
        google_search_term=google_search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        description_format=description_format,
        linkedin_fetch_description=linkedin_fetch_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        # Run a single scraper and normalize the site name for log output
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        cap_name = site.value.capitalize()
        site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
        create_logger(site_name).info("finished scraping")
        return site.value, scraped_data
    site_to_jobs_dict = {}

    def worker(site):
        site_val, scraped_info = scrape_site(site)
        return site_val, scraped_info

    # Scrape all requested sites concurrently, one thread per site
    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(worker, site): site for site in scraper_input.site_type
        }

        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data
    jobs_dfs: list[pd.DataFrame] = []

    # Flatten each job into a one-row DataFrame with normalized fields
    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_url = job_data["job_url"]
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (
                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()

            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (
                    compensation_obj.get("interval").value
                    if compensation_obj.get("interval")
                    else None
                )
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                job_data["salary_source"] = SalarySource.DIRECT_DATA.value
                if enforce_annual_salary and (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
                if country_enum == Country.USA:
                    (
                        job_data["interval"],
                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(
                        job_data["description"],
                        enforce_annual_salary=enforce_annual_salary,
                    )
                    job_data["salary_source"] = SalarySource.DESCRIPTION.value

            job_data["salary_source"] = (
                job_data["salary_source"]
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)

    if jobs_dfs:
        # Step 1: Filter out all-NA columns from each DataFrame before concatenation
        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]

        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)

        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:
                jobs_df[column] = None  # Add missing columns as empty

        # Reorder the DataFrame according to the desired order
        jobs_df = jobs_df[desired_order]

        # Step 4: Sort the DataFrame as required
        return jobs_df.sort_values(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()
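

# Illustrative usage sketch (not part of the library; the search arguments and
# output file name below are assumptions):
#
#     from jobspy import scrape_jobs
#
#     df = scrape_jobs(search_term="data analyst", results_wanted=10)
#     df.to_csv("jobs.csv", index=False)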