# JobSpy/src/jobspy/__init__.py

from __future__ import annotations
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from .jobs import JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
)
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
location: str | None = None,
distance: int | None = 50,
is_remote: bool = False,
job_type: str | None = None,
easy_apply: bool | None = None,
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
    hours_old: int | None = None,
enforce_annual_salary: bool = False,
verbose: int = 2,
**kwargs,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
:return: pandas dataframe containing job data
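
    Example (illustrative; the site names, search term, and location below are
    placeholders, not recommendations):

        jobs = scrape_jobs(
            site_name=["indeed", "linkedin"],
            search_term="software engineer",
            location="San Francisco, CA",
            results_wanted=20,
        )
        print(jobs.head())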
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
set_logger_level(verbose)
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
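    # Resolve a job type string to its JobType enum by checking each member's accepted values.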
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return job_type
        raise ValueError(f"Invalid job type: {value_str}")
job_type = get_enum_from_value(job_type) if job_type else None
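    # Normalize site_name (a string, a Site, or a list of either) into a list of
    # Site enums; when site_name is None, every supported site is scraped.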
def get_site_type():
site_types = list(Site)
if isinstance(site_name, str):
site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site):
site_types = [site_name]
elif isinstance(site_name, list):
site_types = [
map_str_to_site(site) if isinstance(site, str) else site
for site in site_name
]
return site_types
country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput(
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
location=location,
distance=distance,
is_remote=is_remote,
job_type=job_type,
easy_apply=easy_apply,
description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
hours_old=hours_old,
)
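    # Run a single site's scraper against the shared input and return (site value, JobResponse).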
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
        create_logger(site_name).info("finished scraping")
return site.value, scraped_data
site_to_jobs_dict = {}
    def worker(site):
        return scrape_site(site)
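    # Scrape all requested sites concurrently, collecting each JobResponse as it completes.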
with ThreadPoolExecutor() as executor:
future_to_site = {
executor.submit(worker, site): site for site in scraper_input.site_type
}
for future in as_completed(future_to_site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
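    # Convert hourly/weekly/monthly/daily pay ranges to yearly figures
    # (2080 work hours, 52 weeks, 12 months, 260 working days per year).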
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
job_data["max_amount"] *= 2080
if job_data["interval"] == "monthly":
job_data["min_amount"] *= 12
job_data["max_amount"] *= 12
if job_data["interval"] == "weekly":
job_data["min_amount"] *= 52
job_data["max_amount"] *= 52
if job_data["interval"] == "daily":
job_data["min_amount"] *= 260
job_data["max_amount"] *= 260
job_data["interval"] = "yearly"
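    # Flatten each JobResponse into one-row DataFrames, expanding nested fields
    # (location, compensation, job_type) into flat columns.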
jobs_dfs: list[pd.DataFrame] = []
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
job_url = job_data["job_url"]
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
if job_data["location"]:
job_data["location"] = Location(
**job_data["location"]
).display_location()
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
job_data["interval"] = (
compensation_obj.get("interval").value
if compensation_obj.get("interval")
else None
)
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
if enforce_annual_salary and (
job_data["interval"]
and job_data["interval"] != "yearly"
and job_data["min_amount"]
and job_data["max_amount"]
):
convert_to_annual(job_data)
else:
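                # No structured compensation: for US listings, fall back to parsing
                # salary details out of the free-text description.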
if country_enum == Country.USA:
(
job_data["interval"],
job_data["min_amount"],
job_data["max_amount"],
job_data["currency"],
) = extract_salary(
job_data["description"],
enforce_annual_salary=enforce_annual_salary,
)
job_data["salary_source"] = SalarySource.DESCRIPTION.value
job_data["salary_source"] = (
job_data["salary_source"]
if "min_amount" in job_data and job_data["min_amount"]
else None
)
job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)
if jobs_dfs:
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
# Desired column order
desired_order = [
"id",
"site",
"job_url_hyper" if hyperlinks else "job_url",
"job_url_direct",
"title",
"company",
"location",
"job_type",
"date_posted",
"salary_source",
"interval",
"min_amount",
"max_amount",
"currency",
"is_remote",
"job_level",
"job_function",
"company_industry",
"listing_type",
"emails",
"description",
"company_url",
"logo_photo_url",
"company_url_direct",
"company_addresses",
"company_num_employees",
"company_revenue",
"company_description",
]
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
jobs_df[column] = None # Add missing columns as empty
# Reorder the DataFrame according to the desired order
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
else:
return pd.DataFrame()