# JobSpy/src/jobspy/__init__.py

import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
    GlassdoorException,
)

# Dispatch table from each supported Site member to the scraper class that
# handles it. scrape_jobs looks a site up here, instantiates the class with
# ``proxy=...`` and calls its ``scrape`` method.
SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
    Site.GLASSDOOR: GlassdoorScraper,
}


def _map_str_to_site(site_name: str) -> Site:
    """
    Resolve a site name to its ``Site`` member, ignoring letter case.

    The name is upper-cased and used as a subscript on ``Site``; a name that
    does not match any member propagates the error raised by that lookup.
    """
    member_name = site_name.upper()
    return Site[member_name]


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    location: str | None = None,
    distance: int | None = None,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    hyperlinks: bool = False,
    proxy: str | None = None,
    full_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Simultaneously scrapes job data from multiple job sites.

    Each requested site is scraped in its own thread; the results are merged
    into a single dataframe with one row per job posting.

    :param site_name: a site name, a ``Site`` member, or a list mixing both.
        Any other value (including ``None``) selects every member of ``Site``.
    :param job_type: matched against the ``value`` of each ``JobType`` member;
        a falsy value means no job-type filter.
    :param country_indeed: converted with ``Country.from_string``.
    :param hyperlinks: when true the first column is ``job_url_hyper`` (the
        URL wrapped in an HTML anchor) instead of the plain ``job_url``.
    :param proxy: passed to every scraper's constructor.
    :param kwargs: accepted for call compatibility and otherwise ignored.

    The remaining parameters (``search_term``, ``location``, ``distance``,
    ``is_remote``, ``easy_apply``, ``results_wanted``, ``full_description``,
    ``linkedin_company_ids``, ``offset``, ``hours_old``) are forwarded
    unchanged to the scrapers through ``ScraperInput``.

    :return: pandas dataframe containing job data, sorted by ``site``
        (ascending) then ``date_posted`` (descending); an empty dataframe
        when no jobs were found.
    :raises Exception: if ``job_type`` does not match any ``JobType`` member.
    :raises LinkedInException, IndeedException, ZipRecruiterException,
        GlassdoorException: when the corresponding scraper fails.
    """

    def get_enum_from_value(value_str):
        for candidate in JobType:
            if value_str in candidate.value:
                return candidate
        raise Exception(f"Invalid job type: {value_str}")

    job_type = get_enum_from_value(job_type) if job_type else None

    def get_site_type():
        if isinstance(site_name, str):
            return [_map_str_to_site(site_name)]
        if isinstance(site_name, Site):
            return [site_name]
        if isinstance(site_name, list):
            return [
                _map_str_to_site(site) if isinstance(site, str) else site
                for site in site_name
            ]
        # Anything else (notably None) means "scrape every known site".
        return list(Site)

    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=get_site_type(),
        country=country_enum,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        full_description=full_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )

    # Exception type used to wrap an unexpected failure of each site's scraper.
    site_exceptions = {
        Site.LINKEDIN: LinkedInException,
        Site.INDEED: IndeedException,
        Site.ZIP_RECRUITER: ZipRecruiterException,
        Site.GLASSDOOR: GlassdoorException,
    }

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        scraper = SCRAPER_MAPPING[site](proxy=proxy)

        try:
            scraped_data: JobResponse = scraper.scrape(scraper_input)
        except (
            LinkedInException,
            IndeedException,
            ZipRecruiterException,
            GlassdoorException,
        ):
            # Already a site-specific error: propagate untouched so the
            # original type and traceback are preserved.
            raise
        except Exception as e:
            exception_class = site_exceptions.get(site)
            if exception_class is None:
                raise
            # Chain the cause so the underlying failure stays visible.
            raise exception_class(str(e)) from e
        return site.value, scraped_data

    site_to_jobs_dict = {}

    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(scrape_site, site): site
            for site in scraper_input.site_type
        }

        for future in as_completed(future_to_site):
            # result() re-raises any exception raised inside the worker thread.
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data

    jobs_dfs: list[pd.DataFrame] = []

    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_data[
                "job_url_hyper"
            ] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (
                ", ".join(jt.value[0] for jt in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()

            # Flatten the nested compensation dict into top-level columns.
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                interval = compensation_obj.get("interval")
                job_data["interval"] = interval.value if interval else None
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
            else:
                job_data["interval"] = None
                job_data["min_amount"] = None
                job_data["max_amount"] = None
                job_data["currency"] = None

            jobs_dfs.append(pd.DataFrame([job_data]))

    if not jobs_dfs:
        # Nothing scraped: return the empty frame directly. Sorting it by
        # 'site'/'date_posted' would raise KeyError because it has no columns.
        return pd.DataFrame()

    jobs_df = pd.concat(jobs_dfs, ignore_index=True)
    desired_order: list[str] = [
        "job_url_hyper" if hyperlinks else "job_url",
        "site",
        "title",
        "company",
        "company_url",
        "location",
        "job_type",
        "date_posted",
        "interval",
        "min_amount",
        "max_amount",
        "currency",
        "is_remote",
        "num_urgent_words",
        "benefits",
        "emails",
        "description",
    ]
    jobs_formatted_df = jobs_df[desired_order]

    return jobs_formatted_df.sort_values(
        by=["site", "date_posted"], ascending=[True, False]
    )
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`import pandas as pd`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`from typing import Tuple`
enh: Indeed company url (#104) 2024-02-09 10:05:10 -08:00			`from concurrent.futures import ThreadPoolExecutor, as_completed`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .jobs import JobType, Location`
proj structure 2023-09-03 10:05:50 -07:00			`from .scrapers.indeed import IndeedScraper`
			`from .scrapers.ziprecruiter import ZipRecruiterScraper`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`from .scrapers.glassdoor import GlassdoorScraper`
proj structure 2023-09-03 10:05:50 -07:00			`from .scrapers.linkedin import LinkedInScraper`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .scrapers import ScraperInput, Site, JobResponse, Country`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`from .scrapers.exceptions import (`
			`LinkedInException,`
			`IndeedException,`
			`ZipRecruiterException,`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`GlassdoorException,`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`)`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
			`SCRAPER_MAPPING = {`
			`Site.LINKEDIN: LinkedInScraper,`
			`Site.INDEED: IndeedScraper,`
			`Site.ZIP_RECRUITER: ZipRecruiterScraper,`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`Site.GLASSDOOR: GlassdoorScraper,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`}`


			`def _map_str_to_site(site_name: str) -> Site:`
			`return Site[site_name.upper()]`


			`def scrape_jobs(`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`site_name: str \| list[str] \| Site \| list[Site] \| None = None,`
			`search_term: str \| None = None,`
			`location: str \| None = None,`
			`distance: int \| None = None,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`is_remote: bool = False,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`job_type: str \| None = None,`
			`easy_apply: bool \| None = None,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`results_wanted: int = 15,`
			`country_indeed: str = "usa",`
			`hyperlinks: bool = False,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`proxy: str \| None = None,`
			`full_description: bool \| None = False,`
			`linkedin_company_ids: list[int] \| None = None,`
			`offset: int \| None = 0,`
feat: Ability to query by time posted for linkedin, indeed, glassdoor, ziprecruiter (#103) 2024-02-09 12:02:03 -08:00			`hours_old: int = None,`
			`**kwargs,`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`) -> pd.DataFrame:`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`"""`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`Simultaneously scrapes job data from multiple job sites.`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`:return: results_wanted: pandas dataframe containing job data`
			`"""`

fix: job type param bug 2023-09-21 15:42:24 -07:00			`def get_enum_from_value(value_str):`
			`for job_type in JobType:`
			`if value_str in job_type.value:`
			`return job_type`
			`raise Exception(f"Invalid job type: {value_str}")`

add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`job_type = get_enum_from_value(job_type) if job_type else None`
fix: job type param bug 2023-09-21 15:42:24 -07:00
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`def get_site_type():`
			`site_types = list(Site)`
			`if isinstance(site_name, str):`
			`site_types = [_map_str_to_site(site_name)]`
			`elif isinstance(site_name, Site):`
			`site_types = [site_name]`
			`elif isinstance(site_name, list):`
			`site_types = [`
			`_map_str_to_site(site) if isinstance(site, str) else site`
			`for site in site_name`
			`]`
			`return site_types`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Thread sites (#40) 2023-09-06 07:47:11 -07:00			`country_enum = Country.from_string(country_indeed)`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`scraper_input = ScraperInput(`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`site_type=get_site_type(),`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`country=country_enum,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`search_term=search_term,`
			`location=location,`
			`distance=distance,`
			`is_remote=is_remote,`
			`job_type=job_type,`
			`easy_apply=easy_apply,`
enh: full description param (#85) 2024-01-22 18:22:32 -08:00			`full_description=full_description,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`results_wanted=results_wanted,`
enh(linkedin): search by company ids (#99) 2024-02-04 07:21:45 -08:00			`linkedin_company_ids=linkedin_company_ids,`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`offset=offset,`
feat: Ability to query by time posted for linkedin, indeed, glassdoor, ziprecruiter (#103) 2024-02-09 12:02:03 -08:00			`hours_old=hours_old`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`)`

			`def scrape_site(site: Site) -> Tuple[str, JobResponse]:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`scraper_class = SCRAPER_MAPPING[site]`
			`scraper = scraper_class(proxy=proxy)`

Thread sites (#40) 2023-09-06 07:47:11 -07:00			`try:`
			`scraped_data: JobResponse = scraper.scrape(scraper_input)`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`except (LinkedInException, IndeedException, ZipRecruiterException) as lie:`
			`raise lie`
Thread sites (#40) 2023-09-06 07:47:11 -07:00			`except Exception as e:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`if site == Site.LINKEDIN:`
Fix Indeed exceptions on parsing description 2023-10-18 12:25:53 -07:00			`raise LinkedInException(str(e))`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`if site == Site.INDEED:`
Fix Indeed exceptions on parsing description 2023-10-18 12:25:53 -07:00			`raise IndeedException(str(e))`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`if site == Site.ZIP_RECRUITER:`
Fix Indeed exceptions on parsing description 2023-10-18 12:25:53 -07:00			`raise ZipRecruiterException(str(e))`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`if site == Site.GLASSDOOR:`
			`raise GlassdoorException(str(e))`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`else:`
			`raise e`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`return site.value, scraped_data`

Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`site_to_jobs_dict = {}`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
			`def worker(site):`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`site_val, scraped_info = scrape_site(site)`
			`return site_val, scraped_info`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
			`with ThreadPoolExecutor() as executor:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`future_to_site = {`
			`executor.submit(worker, site): site for site in scraper_input.site_type`
			`}`
Thread sites (#40) 2023-09-06 07:47:11 -07:00
enh: Indeed company url (#104) 2024-02-09 10:05:10 -08:00			`for future in as_completed(future_to_site):`
Thread sites (#40) 2023-09-06 07:47:11 -07:00			`site_value, scraped_data = future.result()`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`site_to_jobs_dict[site_value] = scraped_data`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`jobs_dfs: list[pd.DataFrame] = []`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`for site, job_response in site_to_jobs_dict.items():`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`for job in job_response.jobs:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data = job.dict()`
			`job_data[`
			`"job_url_hyper"`
			`] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'`
			`job_data["site"] = site`
			`job_data["company"] = job_data["company_name"]`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`job_data["job_type"] = (`
			`", ".join(job_type.value[0] for job_type in job_data["job_type"])`
			`if job_data["job_type"]`
			`else None`
			`)`
			`job_data["emails"] = (`
			`", ".join(job_data["emails"]) if job_data["emails"] else None`
			`)`
add glassdoor (#66) 2023-10-30 17:57:36 -07:00			`if job_data["location"]:`
			`job_data["location"] = Location(`
			`**job_data["location"]`
			`).display_location()`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`compensation_obj = job_data.get("compensation")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`if compensation_obj and isinstance(compensation_obj, dict):`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["interval"] = (`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`compensation_obj.get("interval").value`
			`if compensation_obj.get("interval")`
			`else None`
			`)`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["min_amount"] = compensation_obj.get("min_amount")`
			`job_data["max_amount"] = compensation_obj.get("max_amount")`
			`job_data["currency"] = compensation_obj.get("currency", "USD")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`job_data["interval"] = None`
			`job_data["min_amount"] = None`
			`job_data["max_amount"] = None`
			`job_data["currency"] = None`

			`job_df = pd.DataFrame([job_data])`
			`jobs_dfs.append(job_df)`

			`if jobs_dfs:`
			`jobs_df = pd.concat(jobs_dfs, ignore_index=True)`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`desired_order: list[str] = [`
add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`"job_url_hyper" if hyperlinks else "job_url",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"site",`
			`"title",`
			`"company",`
fix linkedin bug & add linkedin company url (#67) 2023-11-08 13:51:07 -08:00			`"company_url",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"location",`
			`"job_type",`
add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`"date_posted",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"interval",`
			`"min_amount",`
			`"max_amount",`
			`"currency",`
Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme 2023-10-10 09:23:04 -07:00			`"is_remote",`
			`"num_urgent_words",`
			`"benefits",`
add offset param & email extraction (#51) * add offset param * [enh]: extract emails 2023-09-28 16:11:28 -07:00			`"emails",`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`"description",`
			`]`
			`jobs_formatted_df = jobs_df[desired_order]`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Proxy support (#44) * add proxy support * return as data frame 2023-09-07 09:28:17 -07:00			`jobs_formatted_df = pd.DataFrame()`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
fix job type search (#106) 2024-02-12 09:02:48 -08:00			`return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])`