# JobSpy/src/jobspy/__init__.py

import html
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

import pandas as pd

from .jobs import JobType, Location
from .scrapers.utils import logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
    GlassdoorException,
)


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    hyperlinks: bool = False,
    proxy: str | None = None,
    description_format: str = "markdown",
    linkedin_fetch_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Scrape job postings from one or more job sites concurrently.

    Each requested site is scraped in its own thread; the results are
    flattened into one row per job and combined into a single DataFrame.

    :param site_name: site(s) to scrape, given as ``Site`` members and/or
        ``Site`` member names (upper-cased before lookup). A single value or
        a list is accepted; any other value (including ``None``) scrapes
        every member of ``Site``.
    :param job_type: string matched with ``in`` against the value of each
        ``JobType`` member; the first matching member is used. Falsy values
        mean "no job type filter".
    :param country_indeed: country string parsed with ``Country.from_string``
        and handed to the scrapers as ``country``.
    :param hyperlinks: when true the output contains ``job_url_hyper`` (an
        HTML ``<a>`` tag) in place of the plain ``job_url`` column.
    :param proxy: passed to every scraper's constructor.
    :param search_term, location, distance, is_remote, easy_apply,
        results_wanted, description_format, linkedin_fetch_description,
        linkedin_company_ids, offset, hours_old: forwarded unchanged to
        ``ScraperInput``.
    :param kwargs: accepted for call compatibility and ignored.
    :return: DataFrame with a fixed column order, sorted by ``site``
        ascending and ``date_posted`` descending; an empty DataFrame with no
        columns when no jobs were scraped.
    :raises Exception: if ``job_type`` matches no ``JobType`` member.
        Invalid site names, an invalid country, or an error raised inside a
        scraper propagate to the caller unchanged.
    """
    scraper_by_site = {
        Site.LINKEDIN: LinkedInScraper,
        Site.INDEED: IndeedScraper,
        Site.ZIP_RECRUITER: ZipRecruiterScraper,
        Site.GLASSDOOR: GlassdoorScraper,
    }

    # Validation order is kept deliberately: job type, then country, then
    # sites, so callers see the same first error as before.
    job_type_enum = _job_type_from_value(job_type) if job_type else None
    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=_resolve_sites(site_name),
        country=country_enum,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type_enum,
        easy_apply=easy_apply,
        description_format=description_format,
        linkedin_fetch_description=linkedin_fetch_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        """Run the scraper registered for ``site`` and return ``(site.value, response)``."""
        scraper = scraper_by_site[site](proxy=proxy)
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        # "Zip_recruiter" is what capitalize() yields for that site's value;
        # log the brand spelling instead.
        display_name = site.value.capitalize()
        if display_name == "Zip_recruiter":
            display_name = "ZipRecruiter"
        logger.info(f"{display_name} finished scraping")
        return site.value, scraped_data

    site_to_jobs: dict[str, JobResponse] = {}
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(scrape_site, site) for site in scraper_input.site_type
        ]
        # Collect in completion order; result() re-raises any scraper error.
        for future in as_completed(futures):
            site_value, scraped_data = future.result()
            site_to_jobs[site_value] = scraped_data

    # One single-row frame per job, so all-NA columns can be dropped per job
    # before concatenation (see below).
    jobs_dfs: list[pd.DataFrame] = [
        pd.DataFrame([_job_to_row(job, site)])
        for site, job_response in site_to_jobs.items()
        for job in job_response.jobs
    ]
    if not jobs_dfs:
        return pd.DataFrame()

    # Drop all-NA columns from every frame before concatenating them.
    filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
    jobs_df = pd.concat(filtered_dfs, ignore_index=True)

    desired_order = [
        "site",
        "job_url_hyper" if hyperlinks else "job_url",
        "job_url_direct",
        "title",
        "company",
        "location",
        "job_type",
        "date_posted",
        "interval",
        "min_amount",
        "max_amount",
        "currency",
        "is_remote",
        "emails",
        "description",
        "company_url",
        "company_url_direct",
        "company_addresses",
        "company_industry",
        "company_num_employees",
        "company_revenue",
        "company_description",
        "logo_photo_url",
        "banner_photo_url",
        "ceo_name",
        "ceo_photo_url",
    ]

    # Columns that were empty for every job were dropped above; add them back
    # (filled with None) so the output schema is always the same.
    for column in desired_order:
        if column not in jobs_df.columns:
            jobs_df[column] = None

    jobs_df = jobs_df[desired_order]
    return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])


def _resolve_sites(site_name) -> list:
    """Normalise the ``site_name`` argument of ``scrape_jobs`` to a list of ``Site``."""

    def to_site(value):
        # Strings are looked up by upper-cased member name; anything else is
        # passed through as given.
        return Site[value.upper()] if isinstance(value, str) else value

    if isinstance(site_name, (str, Site)):
        return [to_site(site_name)]
    if isinstance(site_name, list):
        return [to_site(entry) for entry in site_name]
    # None (or any other unsupported type) selects every site.
    return list(Site)


def _job_type_from_value(value_str):
    """Return the first ``JobType`` member whose value contains ``value_str``.

    :raises Exception: if no member matches.
    """
    for candidate in JobType:
        if value_str in candidate.value:
            return candidate
    raise Exception(f"Invalid job type: {value_str}")


def _job_to_row(job, site: str) -> dict:
    """Flatten one scraped job into the dict used as a DataFrame row."""
    row = job.dict()

    # The URL comes from scraped pages, so escape it before embedding it in
    # an HTML attribute and element text.
    safe_url = html.escape(str(row["job_url"]), quote=True)
    row["job_url_hyper"] = f'<a href="{safe_url}">{safe_url}</a>'
    row["site"] = site
    row["company"] = row["company_name"]
    row["job_type"] = (
        ", ".join(member.value[0] for member in row["job_type"])
        if row["job_type"]
        else None
    )
    row["emails"] = ", ".join(row["emails"]) if row["emails"] else None
    if row["location"]:
        row["location"] = Location(**row["location"]).display_location()

    compensation = row.get("compensation")
    if compensation and isinstance(compensation, dict):
        interval = compensation.get("interval")
        row["interval"] = interval.value if interval else None
        row["min_amount"] = compensation.get("min_amount")
        row["max_amount"] = compensation.get("max_amount")
        row["currency"] = compensation.get("currency", "USD")
    else:
        for key in ("interval", "min_amount", "max_amount", "currency"):
            row[key] = None

    return row
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`import pandas as pd`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`from typing import Tuple`
enh: Indeed company url (#104) 2024-02-09 10:05:10 -08:00			`from concurrent.futures import ThreadPoolExecutor, as_completed`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .jobs import JobType, Location`
enh: indeed more fields (#126) 2024-03-08 23:40:01 -08:00			`from .scrapers.utils import logger`
proj structure 2023-09-03 10:05:50 -07:00			`from .scrapers.indeed import IndeedScraper`
			`from .scrapers.ziprecruiter import ZipRecruiterScraper`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`from .scrapers.glassdoor import GlassdoorScraper`
proj structure 2023-09-03 10:05:50 -07:00			`from .scrapers.linkedin import LinkedInScraper`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .scrapers import ScraperInput, Site, JobResponse, Country`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`from .scrapers.exceptions import (`
			`LinkedInException,`
			`IndeedException,`
			`ZipRecruiterException,`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`GlassdoorException,`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`)`
Library Migration (#31) 2023-09-03 07:29:25 -07:00

			`def scrape_jobs(`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`site_name: str \| list[str] \| Site \| list[Site] \| None = None,`
			`search_term: str \| None = None,`
			`location: str \| None = None,`
enh: indeed more fields (#126) 2024-03-08 23:40:01 -08:00			`distance: int \| None = 50,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`is_remote: bool = False,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`job_type: str \| None = None,`
			`easy_apply: bool \| None = None,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`results_wanted: int = 15,`
			`country_indeed: str = "usa",`
			`hyperlinks: bool = False,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`proxy: str \| None = None,`
Description format (#107) 2024-02-14 14:04:23 -08:00			`description_format: str = "markdown",`
			`linkedin_fetch_description: bool \| None = False,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`linkedin_company_ids: list[int] \| None = None,`
			`offset: int \| None = 0,`
feat: Ability to query by time posted for linkedin, indeed, glassdoor, ziprecruiter (#103) 2024-02-09 12:02:03 -08:00			`hours_old: int = None,`
			`**kwargs,`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`) -> pd.DataFrame:`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`"""`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`Simultaneously scrapes job data from multiple job sites.`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`:return: results_wanted: pandas dataframe containing job data`
			`"""`
Description format (#107) 2024-02-14 14:04:23 -08:00			`SCRAPER_MAPPING = {`
			`Site.LINKEDIN: LinkedInScraper,`
			`Site.INDEED: IndeedScraper,`
			`Site.ZIP_RECRUITER: ZipRecruiterScraper,`
			`Site.GLASSDOOR: GlassdoorScraper,`
			`}`

			`def map_str_to_site(site_name: str) -> Site:`
			`return Site[site_name.upper()]`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
fix: job type param bug 2023-09-21 15:42:24 -07:00			`def get_enum_from_value(value_str):`
			`for job_type in JobType:`
			`if value_str in job_type.value:`
			`return job_type`
			`raise Exception(f"Invalid job type: {value_str}")`

add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`job_type = get_enum_from_value(job_type) if job_type else None`
fix: job type param bug 2023-09-21 15:42:24 -07:00
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`def get_site_type():`
			`site_types = list(Site)`
			`if isinstance(site_name, str):`
Description format (#107) 2024-02-14 14:04:23 -08:00			`site_types = [map_str_to_site(site_name)]`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`elif isinstance(site_name, Site):`
			`site_types = [site_name]`
			`elif isinstance(site_name, list):`
			`site_types = [`
Description format (#107) 2024-02-14 14:04:23 -08:00			`map_str_to_site(site) if isinstance(site, str) else site`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`for site in site_name`
			`]`
			`return site_types`
Thread sites (#40) 2023-09-06 07:47:11 -07:00			`country_enum = Country.from_string(country_indeed)`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`scraper_input = ScraperInput(`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`site_type=get_site_type(),`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`country=country_enum,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`search_term=search_term,`
			`location=location,`
			`distance=distance,`
			`is_remote=is_remote,`
			`job_type=job_type,`
			`easy_apply=easy_apply,`
Description format (#107) 2024-02-14 14:04:23 -08:00			`description_format=description_format,`
			`linkedin_fetch_description=linkedin_fetch_description,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`results_wanted=results_wanted,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`linkedin_company_ids=linkedin_company_ids,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`offset=offset,`
feat: Ability to query by time posted for linkedin, indeed, glassdoor, ziprecruiter (#103) 2024-02-09 12:02:03 -08:00			`hours_old=hours_old`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`)`

			`def scrape_site(site: Site) -> Tuple[str, JobResponse]:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`scraper_class = SCRAPER_MAPPING[site]`
			`scraper = scraper_class(proxy=proxy)`
Description format (#107) 2024-02-14 14:04:23 -08:00			`scraped_data: JobResponse = scraper.scrape(scraper_input)`
enh: indeed more fields (#126) 2024-03-08 23:40:01 -08:00			`site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()`
			`logger.info(f"{site_name} finished scraping")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`return site.value, scraped_data`

Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`site_to_jobs_dict = {}`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
			`def worker(site):`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`site_val, scraped_info = scrape_site(site)`
			`return site_val, scraped_info`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
			`with ThreadPoolExecutor() as executor:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`future_to_site = {`
			`executor.submit(worker, site): site for site in scraper_input.site_type`
			`}`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
enh: Indeed company url (#104) 2024-02-09 10:05:10 -08:00			`for future in as_completed(future_to_site):`
Thread sites (#40) 2023-09-06 07:47:11 -07:00			`site_value, scraped_data = future.result()`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`site_to_jobs_dict[site_value] = scraped_data`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`jobs_dfs: list[pd.DataFrame] = []`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`for site, job_response in site_to_jobs_dict.items():`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`for job in job_response.jobs:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data = job.dict()`
			`job_data[`
			`"job_url_hyper"`
			`] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'`
			`job_data["site"] = site`
			`job_data["company"] = job_data["company_name"]`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`job_data["job_type"] = (`
			`", ".join(job_type.value[0] for job_type in job_data["job_type"])`
			`if job_data["job_type"]`
			`else None`
			`)`
			`job_data["emails"] = (`
			`", ".join(job_data["emails"]) if job_data["emails"] else None`
			`)`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`if job_data["location"]:`
			`job_data["location"] = Location(`
			`**job_data["location"]`
			`).display_location()`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`compensation_obj = job_data.get("compensation")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`if compensation_obj and isinstance(compensation_obj, dict):`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["interval"] = (`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`compensation_obj.get("interval").value`
			`if compensation_obj.get("interval")`
			`else None`
			`)`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["min_amount"] = compensation_obj.get("min_amount")`
			`job_data["max_amount"] = compensation_obj.get("max_amount")`
			`job_data["currency"] = compensation_obj.get("currency", "USD")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["interval"] = None`
			`job_data["min_amount"] = None`
			`job_data["max_amount"] = None`
			`job_data["currency"] = None`

			`job_df = pd.DataFrame([job_data])`
			`jobs_dfs.append(job_df)`

			`if jobs_dfs:`
Remove pandas warning (#118) 2024-02-29 19:30:56 -08:00			`# Step 1: Filter out all-NA columns from each DataFrame before concatenation`
			`filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]`

			`# Step 2: Concatenate the filtered DataFrames`
			`jobs_df = pd.concat(filtered_dfs, ignore_index=True)`

			`# Desired column order`
			`desired_order = [`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"site",`
enh: indeed more fields (#126) 2024-03-08 23:40:01 -08:00			`"job_url_hyper" if hyperlinks else "job_url",`
			`"job_url_direct",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"title",`
			`"company",`
			`"location",`
			`"job_type",`
add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`"date_posted",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"interval",`
			`"min_amount",`
			`"max_amount",`
			`"currency",`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`"is_remote",`
add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`"emails",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"description",`
enh: indeed more fields (#126) 2024-03-08 23:40:01 -08:00
			`"company_url",`
			`"company_url_direct",`
			`"company_addresses",`
			`"company_industry",`
			`"company_num_employees",`
			`"company_revenue",`
			`"company_description",`
			`"logo_photo_url",`
			`"banner_photo_url",`
			`"ceo_name",`
			`"ceo_photo_url",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`]`
Remove pandas warning (#118) 2024-02-29 19:30:56 -08:00
			`# Step 3: Ensure all desired columns are present, adding missing ones as empty`
			`for column in desired_order:`
			`if column not in jobs_df.columns:`
			`jobs_df[column] = None # Add missing columns as empty`

			`# Reorder the DataFrame according to the desired order`
			`jobs_df = jobs_df[desired_order]`

			`# Step 4: Sort the DataFrame as required`
			`return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Description format (#107) 2024-02-14 14:04:23 -08:00			`return pd.DataFrame()`