mirror of https://github.com/Bunsly/JobSpy
from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
    set_logger_level,
    extract_salary,
    create_logger,
    get_enum_from_value,
    map_str_to_site,
    convert_to_annual,
    desired_order,
)
from jobspy.ziprecruiter import ZipRecruiter


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    google_search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",
    linkedin_fetch_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    enforce_annual_salary: bool = False,
    verbose: int = 0,
    **kwargs,
) -> pd.DataFrame:
    """
    Scrapes job data from job boards concurrently.

    :return: Pandas DataFrame containing job data
    """
    SCRAPER_MAPPING = {
        Site.LINKEDIN: LinkedIn,
        Site.INDEED: Indeed,
        Site.ZIP_RECRUITER: ZipRecruiter,
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None
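
    # Normalize site_name (a string, a Site, or a list of either) into a list of
    # Site enum members; when site_name is None, every supported site is scraped.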
    def get_site_type():
        site_types = list(Site)
        if isinstance(site_name, str):
            site_types = [map_str_to_site(site_name)]
        elif isinstance(site_name, Site):
            site_types = [site_name]
        elif isinstance(site_name, list):
            site_types = [
                map_str_to_site(site) if isinstance(site, str) else site
                for site in site_name
            ]
        return site_types
    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=get_site_type(),
        country=country_enum,
        search_term=search_term,
        google_search_term=google_search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        description_format=description_format,
        linkedin_fetch_description=linkedin_fetch_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )
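
    # Run a single board's scraper and return (site value, JobResponse) so the
    # results can be collected per site.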
    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        cap_name = site.value.capitalize()
        site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
        create_logger(site_name).info("finished scraping")
        return site.value, scraped_data
    site_to_jobs_dict = {}

    def worker(site):
        site_val, scraped_info = scrape_site(site)
        return site_val, scraped_info
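
    # Scrape all requested sites concurrently, one thread per site, collecting
    # each JobResponse keyed by the site's string value.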
    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(worker, site): site for site in scraper_input.site_type
        }

        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data
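
    # Flatten every JobResponse into one DataFrame row per job, normalizing the
    # job type, emails, location, and salary fields along the way.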
    jobs_dfs: list[pd.DataFrame] = []

    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_url = job_data["job_url"]
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (
                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()
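
            # Salary handling: use structured compensation data when the scraper
            # returned it; otherwise (US only) try to parse a salary range out of
            # the job description.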
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (
                    compensation_obj.get("interval").value
                    if compensation_obj.get("interval")
                    else None
                )
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                job_data["salary_source"] = SalarySource.DIRECT_DATA.value
                if enforce_annual_salary and (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
                if country_enum == Country.USA:
                    (
                        job_data["interval"],
                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(
                        job_data["description"],
                        enforce_annual_salary=enforce_annual_salary,
                    )
                    job_data["salary_source"] = SalarySource.DESCRIPTION.value

            job_data["salary_source"] = (
                job_data["salary_source"]
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)
    if jobs_dfs:
        # Step 1: Filter out all-NA columns from each DataFrame before concatenation
        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]

        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)

        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:
                jobs_df[column] = None  # Add missing columns as empty

        # Reorder the DataFrame according to the desired order
        jobs_df = jobs_df[desired_order]

        # Step 4: Sort by site (ascending) and date posted (newest first)
        return jobs_df.sort_values(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()