JobSpy/src/jobspy/__init__.py

import pandas as pd
from typing import List, Tuple

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country


# Dispatch table: each supported Site member mapped to the scraper class that
# scrape_jobs instantiates (with no arguments) to scrape that site.
SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
}


def _map_str_to_site(site_name: str) -> Site:
    """Resolve a site name string to its ``Site`` member.

    The lookup is by member *name* and ignores the case of ``site_name``
    (it is upper-cased first), e.g. ``"indeed"`` resolves ``Site["INDEED"]``.
    An unknown name propagates whatever ``Site[...]`` raises (presumably
    ``KeyError``, assuming ``Site`` is a standard ``Enum`` -- confirm).
    """
    member_name = site_name.upper()
    return Site[member_name]


def scrape_jobs(
    site_name: str | Site | List[Site],
    search_term: str,
    location: str = "",
    distance: int = None,
    is_remote: bool = False,
    job_type: JobType = None,
    easy_apply: bool = False,  # linkedin
    results_wanted: int = 15,
    country: str = "usa",
) -> pd.DataFrame:
    """
    Scrape job postings from one or more job sites and flatten them into a
    single DataFrame. Sites are scraped sequentially, one after another.

    :param site_name: a site name string (resolved case-insensitively by
        ``_map_str_to_site``), a single ``Site``, or a list of ``Site`` members
    :param search_term: forwarded unchanged to ``ScraperInput``
    :param location: forwarded unchanged to ``ScraperInput``
    :param distance: forwarded unchanged to ``ScraperInput``
    :param is_remote: forwarded unchanged to ``ScraperInput``
    :param job_type: forwarded unchanged to ``ScraperInput``
    :param easy_apply: forwarded unchanged to ``ScraperInput``
        (presumably only honoured by the LinkedIn scraper -- confirm)
    :param results_wanted: forwarded unchanged to ``ScraperInput``
    :param country: country name, converted with ``Country.from_string``
    :return: DataFrame with one row per scraped job and the columns
        site, title, company, location, job_type, interval, min_amount,
        max_amount, currency, job_url, description (in that order); an empty
        DataFrame without columns when no jobs were returned
    """
    # Normalise site_name to a list of Site members. Site is tested first so
    # that a Site value is never mistaken for a plain name string.
    if isinstance(site_name, Site):
        site_type = [site_name]
    elif isinstance(site_name, str):
        site_type = [_map_str_to_site(site_name)]
    else:
        site_type = site_name

    country_enum = Country.from_string(country)

    scraper_input = ScraperInput(
        site_type=site_type,
        country=country_enum,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
    )

    # Run every requested scraper in turn. Responses are keyed by the site's
    # enum value, so a site listed twice keeps only its last response.
    results = {}
    for site in scraper_input.site_type:
        scraper = SCRAPER_MAPPING[site]()
        results[site.value] = scraper.scrape(scraper_input)

    dfs = []
    for site, job_response in results.items():
        for job in job_response.jobs:
            data = job.dict()
            data["site"] = site
            data["company"] = data["company_name"]

            # Take the first value from the job type tuple
            posting_type = data["job_type"]
            data["job_type"] = posting_type.value[0] if posting_type else None

            data["location"] = Location(**data["location"]).display_location()

            # Flatten the nested compensation dict into top-level columns.
            compensation = data.get("compensation")
            if compensation and isinstance(compensation, dict):
                interval = compensation.get("interval")
                data["interval"] = interval.value if interval else None
                data["min_amount"] = compensation.get("min_amount")
                data["max_amount"] = compensation.get("max_amount")
                data["currency"] = compensation.get("currency", "USD")
            else:
                data["interval"] = None
                data["min_amount"] = None
                data["max_amount"] = None
                data["currency"] = None

            # One single-row frame per job; they are concatenated below.
            dfs.append(pd.DataFrame([data]))

    if not dfs:
        return pd.DataFrame()

    desired_order = [
        "site",
        "title",
        "company",
        "location",
        "job_type",
        "interval",
        "min_amount",
        "max_amount",
        "currency",
        "job_url",
        "description",
    ]
    df = pd.concat(dfs, ignore_index=True)
    # Keep only the public columns, in a stable order.
    return df[desired_order]
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`import pandas as pd`
proj structure 2023-09-03 10:05:50 -07:00			`from typing import List, Tuple`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .jobs import JobType, Location`
proj structure 2023-09-03 10:05:50 -07:00			`from .scrapers.indeed import IndeedScraper`
			`from .scrapers.ziprecruiter import ZipRecruiterScraper`
			`from .scrapers.linkedin import LinkedInScraper`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`from .scrapers import ScraperInput, Site, JobResponse, Country`
Library Migration (#31) 2023-09-03 07:29:25 -07:00

			`SCRAPER_MAPPING = {`
			`Site.LINKEDIN: LinkedInScraper,`
			`Site.INDEED: IndeedScraper,`
			`Site.ZIP_RECRUITER: ZipRecruiterScraper,`
			`}`


			`def _map_str_to_site(site_name: str) -> Site:`
			`return Site[site_name.upper()]`


			`def scrape_jobs(`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`site_name: str \| Site \| List[Site],`
			`search_term: str,`
			`location: str = "",`
			`distance: int = None,`
			`is_remote: bool = False,`
			`job_type: JobType = None,`
			`easy_apply: bool = False, # linkedin`
			`results_wanted: int = 15,`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`country: str = "usa",`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`) -> pd.DataFrame:`
			`"""`
			`Asynchronously scrapes job data from multiple job sites.`
			`:return: results_wanted: pandas dataframe containing job data`
			`"""`

			`if type(site_name) == str:`
			`site_name = _map_str_to_site(site_name)`

Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`country_enum = Country.from_string(country)`

Library Migration (#31) 2023-09-03 07:29:25 -07:00			`site_type = [site_name] if type(site_name) == Site else site_name`
			`scraper_input = ScraperInput(`
			`site_type=site_type,`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`country=country_enum,`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`search_term=search_term,`
			`location=location,`
			`distance=distance,`
			`is_remote=is_remote,`
			`job_type=job_type,`
			`easy_apply=easy_apply,`
			`results_wanted=results_wanted,`
			`)`

			`def scrape_site(site: Site) -> Tuple[str, JobResponse]:`
			`scraper_class = SCRAPER_MAPPING[site]`
			`scraper = scraper_class()`
			`scraped_data: JobResponse = scraper.scrape(scraper_input)`

			`return site.value, scraped_data`

			`results = {}`
			`for site in scraper_input.site_type:`
			`site_value, scraped_data = scrape_site(site)`
			`results[site_value] = scraped_data`

			`dfs = []`

			`for site, job_response in results.items():`
			`for job in job_response.jobs:`
			`data = job.dict()`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`data["site"] = site`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`data["company"] = data["company_name"]`
			`if data["job_type"]:`
			`# Take the first value from the job type tuple`
			`data["job_type"] = data["job_type"].value[0]`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`data["job_type"] = None`

			`data["location"] = Location(**data["location"]).display_location()`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
Validation error (#35) 2023-09-03 18:05:31 -07:00			`compensation_obj = data.get("compensation")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`if compensation_obj and isinstance(compensation_obj, dict):`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`data["interval"] = (`
			`compensation_obj.get("interval").value`
			`if compensation_obj.get("interval")`
			`else None`
			`)`
			`data["min_amount"] = compensation_obj.get("min_amount")`
			`data["max_amount"] = compensation_obj.get("max_amount")`
			`data["currency"] = compensation_obj.get("currency", "USD")`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`else:`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`data["interval"] = None`
			`data["min_amount"] = None`
			`data["max_amount"] = None`
			`data["currency"] = None`
Library Migration (#31) 2023-09-03 07:29:25 -07:00
			`job_df = pd.DataFrame([data])`
			`dfs.append(job_df)`

			`if dfs:`
			`df = pd.concat(dfs, ignore_index=True)`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`desired_order = [`
			`"site",`
			`"title",`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`"company",`
			`"location",`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`"job_type",`
			`"interval",`
			`"min_amount",`
			`"max_amount",`
Indeed country support (#38) 2023-09-05 10:17:22 -07:00			`"currency",`
Validation error (#35) 2023-09-03 18:05:31 -07:00			`"job_url",`
			`"description",`
			`]`
Library Migration (#31) 2023-09-03 07:29:25 -07:00			`df = df[desired_order]`
			`else:`
			`df = pd.DataFrame()`

			`return df`