# JobSpy/jobspy/__init__.py

from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
    set_logger_level,
    extract_salary,
    create_logger,
    get_enum_from_value,
    map_str_to_site,
    convert_to_annual,
    desired_order,
)
from jobspy.ziprecruiter import ZipRecruiter


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    google_search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",
    linkedin_fetch_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    enforce_annual_salary: bool = False,
    verbose: int = 0,
    **kwargs,
) -> pd.DataFrame:
    """
    Scrapes job data from job boards concurrently.

    :return: pandas DataFrame containing job data
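
    Example (an illustrative sketch; the argument values below are
    assumptions, not defaults):

        jobs = scrape_jobs(
            site_name=["indeed", "linkedin"],
            search_term="software engineer",
            location="San Francisco, CA",
            results_wanted=20,
        )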
"""
    SCRAPER_MAPPING = {
        Site.LINKEDIN: LinkedIn,
        Site.INDEED: Indeed,
        Site.ZIP_RECRUITER: ZipRecruiter,
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None

    def get_site_type():
        site_types = list(Site)
        if isinstance(site_name, str):
            site_types = [map_str_to_site(site_name)]
        elif isinstance(site_name, Site):
            site_types = [site_name]
        elif isinstance(site_name, list):
            site_types = [
                map_str_to_site(site) if isinstance(site, str) else site
                for site in site_name
            ]
        return site_types

    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=get_site_type(),
        country=country_enum,
        search_term=search_term,
        google_search_term=google_search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        description_format=description_format,
        linkedin_fetch_description=linkedin_fetch_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        # Run a single scraper and normalize the site name for log output
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        cap_name = site.value.capitalize()
        site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
        create_logger(site_name).info("finished scraping")
        return site.value, scraped_data
    site_to_jobs_dict = {}

    def worker(site):
        site_val, scraped_info = scrape_site(site)
        return site_val, scraped_info

    # Scrape all requested sites concurrently, one thread per site
    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(worker, site): site for site in scraper_input.site_type
        }

        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data
    jobs_dfs: list[pd.DataFrame] = []

    # Flatten each job into a one-row DataFrame with normalized fields
    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_url = job_data["job_url"]
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (
                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()

            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (
                    compensation_obj.get("interval").value
                    if compensation_obj.get("interval")
                    else None
                )
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                job_data["salary_source"] = SalarySource.DIRECT_DATA.value
                if enforce_annual_salary and (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
                if country_enum == Country.USA:
                    (
                        job_data["interval"],
                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(
                        job_data["description"],
                        enforce_annual_salary=enforce_annual_salary,
                    )
                    job_data["salary_source"] = SalarySource.DESCRIPTION.value

            job_data["salary_source"] = (
                job_data["salary_source"]
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)

    if jobs_dfs:
        # Step 1: Filter out all-NA columns from each DataFrame before concatenation
        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]

        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)

        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:
                jobs_df[column] = None  # Add missing columns as empty

        # Reorder the DataFrame according to the desired order
        jobs_df = jobs_df[desired_order]

        # Step 4: Sort the DataFrame as required
        return jobs_df.sort_values(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()
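

# Illustrative usage sketch (not part of the library; the search arguments and
# output file name below are assumptions):
#
#     from jobspy import scrape_jobs
#
#     df = scrape_jobs(search_term="data analyst", results_wanted=10)
#     df.to_csv("jobs.csv", index=False)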