# JobSpy/src/jobspy/__init__.py

from __future__ import annotations
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from .jobs import JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
)
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
location: str | None = None,
distance: int | None = 50,
is_remote: bool = False,
job_type: str | None = None,
easy_apply: bool | None = None,
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
    hours_old: int | None = None,
enforce_annual_salary: bool = False,
verbose: int = 2,
**kwargs,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
:return: pandas dataframe containing job data
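
    Example (illustrative; the site names, search term, and location below are
    placeholders, not recommendations):

        jobs = scrape_jobs(
            site_name=["indeed", "linkedin"],
            search_term="software engineer",
            location="San Francisco, CA",
            results_wanted=20,
        )
        print(jobs.head())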
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
set_logger_level(verbose)
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
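    # Resolve a job type string to its JobType enum by checking each member's accepted values.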
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return job_type
        raise ValueError(f"Invalid job type: {value_str}")
job_type = get_enum_from_value(job_type) if job_type else None
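    # Normalize site_name (a string, a Site, or a list of either) into a list of
    # Site enums; when site_name is None, every supported site is scraped.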
def get_site_type():
site_types = list(Site)
if isinstance(site_name, str):
site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site):
site_types = [site_name]
elif isinstance(site_name, list):
site_types = [
map_str_to_site(site) if isinstance(site, str) else site
for site in site_name
]
return site_types
country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput(
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
location=location,
distance=distance,
is_remote=is_remote,
job_type=job_type,
easy_apply=easy_apply,
description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
hours_old=hours_old,
)
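    # Run a single site's scraper against the shared input and return (site value, JobResponse).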
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
        create_logger(site_name).info("finished scraping")
return site.value, scraped_data
site_to_jobs_dict = {}
    def worker(site):
        return scrape_site(site)
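    # Scrape all requested sites concurrently, collecting each JobResponse as it completes.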
with ThreadPoolExecutor() as executor:
future_to_site = {
executor.submit(worker, site): site for site in scraper_input.site_type
}
for future in as_completed(future_to_site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
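    # Convert hourly/weekly/monthly/daily pay ranges to yearly figures
    # (2080 work hours, 52 weeks, 12 months, 260 working days per year).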
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
job_data["max_amount"] *= 2080
if job_data["interval"] == "monthly":
job_data["min_amount"] *= 12
job_data["max_amount"] *= 12
if job_data["interval"] == "weekly":
job_data["min_amount"] *= 52
job_data["max_amount"] *= 52
if job_data["interval"] == "daily":
job_data["min_amount"] *= 260
job_data["max_amount"] *= 260
job_data["interval"] = "yearly"
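    # Flatten each JobResponse into one-row DataFrames, expanding nested fields
    # (location, compensation, job_type) into flat columns.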
jobs_dfs: list[pd.DataFrame] = []
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
job_url = job_data["job_url"]
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
if job_data["location"]:
job_data["location"] = Location(
**job_data["location"]
).display_location()
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
job_data["interval"] = (
compensation_obj.get("interval").value
if compensation_obj.get("interval")
else None
)
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
if enforce_annual_salary and (
job_data["interval"]
and job_data["interval"] != "yearly"
and job_data["min_amount"]
and job_data["max_amount"]
):
convert_to_annual(job_data)
else:
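                # No structured compensation: for US listings, fall back to parsing
                # salary details out of the free-text description.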
if country_enum == Country.USA:
(
job_data["interval"],
job_data["min_amount"],
job_data["max_amount"],
job_data["currency"],
) = extract_salary(
job_data["description"],
enforce_annual_salary=enforce_annual_salary,
)
job_data["salary_source"] = SalarySource.DESCRIPTION.value
job_data["salary_source"] = (
job_data["salary_source"]
if "min_amount" in job_data and job_data["min_amount"]
else None
)
job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)
if jobs_dfs:
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
# Desired column order
desired_order = [
"id",
"site",
"job_url_hyper" if hyperlinks else "job_url",
"job_url_direct",
"title",
"company",
"location",
"job_type",
"date_posted",
"salary_source",
"interval",
"min_amount",
"max_amount",
"currency",
"is_remote",
"job_level",
"job_function",
"company_industry",
"listing_type",
"emails",
"description",
"company_url",
"logo_photo_url",
"company_url_direct",
"company_addresses",
"company_num_employees",
"company_revenue",
"company_description",
]
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
jobs_df[column] = None # Add missing columns as empty
# Reorder the DataFrame according to the desired order
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
else:
return pd.DataFrame()