JobSpy/src/jobspy/__init__.py

import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, NamedTuple, Tuple

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country

SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
}
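
# The mapping above is a dispatch table keyed by Site. A quick illustration
# (hypothetical snippet, not executed by this module):
#
#     scraper = SCRAPER_MAPPING[Site.LINKEDIN]()   # -> a LinkedInScraper instance
#     response = scraper.scrape(scraper_input)     # scraper_input assumed built elsewhere
#
# scrape_site() inside scrape_jobs() below uses exactly this lookup.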


class ScrapeResults(NamedTuple):
    """Scraped jobs and any per-site errors, each as a DataFrame."""

    jobs: pd.DataFrame
    errors: pd.DataFrame


def _map_str_to_site(site_name: str) -> Site:
    return Site[site_name.upper()]
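
# For example (hypothetical REPL session):
#     >>> _map_str_to_site("linkedin")
#     <Site.LINKEDIN: ...>        # exact repr depends on the Site enum's value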


def scrape_jobs(
    site_name: str | Site | List[Site],
    search_term: str,
    location: str = "",
    distance: int | None = None,
    is_remote: bool = False,
    job_type: JobType | None = None,
    easy_apply: bool = False,  # LinkedIn only
    results_wanted: int = 15,
    country_indeed: str = "usa",
    hyperlinks: bool = False,
) -> ScrapeResults:
    """
    Concurrently scrapes job data from multiple job sites.

    :return: a ScrapeResults named tuple: `jobs` is a DataFrame of scraped
        postings; `errors` is a DataFrame of per-site error messages.
    """
    if isinstance(site_name, str):
        site_name = _map_str_to_site(site_name)

    country_enum = Country.from_string(country_indeed)

    site_type = [site_name] if isinstance(site_name, Site) else site_name
    scraper_input = ScraperInput(
        site_type=site_type,
        country=country_enum,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        # Run one scraper; convert any exception into an error JobResponse
        # so a single failing site cannot abort the whole scrape.
        try:
            scraper_class = SCRAPER_MAPPING[site]
            scraper = scraper_class()
            scraped_data: JobResponse = scraper.scrape(scraper_input)
        except Exception as e:
            scraped_data = JobResponse(jobs=[], error=str(e), success=False)
        return site.value, scraped_data

    results, errors = {}, {}

    # Scrape all requested sites concurrently, one thread per site.
    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(scrape_site, site): site
            for site in scraper_input.site_type
        }
        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            results[site_value] = scraped_data
            if scraped_data.error:
                errors[site_value] = scraped_data.error

    # Flatten each JobResponse into one row per job.
    dfs = []
    for site, job_response in results.items():
        for job in job_response.jobs:
            data = job.dict()
            data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
            data["site"] = site
            data["company"] = data["company_name"]
            if data["job_type"]:
                # Take the first value from the job type tuple
                data["job_type"] = data["job_type"].value[0]
            else:
                data["job_type"] = None
            data["location"] = Location(**data["location"]).display_location()

            # Flatten the nested compensation dict into top-level columns.
            compensation_obj = data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                data["interval"] = (
                    compensation_obj.get("interval").value
                    if compensation_obj.get("interval")
                    else None
                )
                data["min_amount"] = compensation_obj.get("min_amount")
                data["max_amount"] = compensation_obj.get("max_amount")
                data["currency"] = compensation_obj.get("currency", "USD")
            else:
                data["interval"] = None
                data["min_amount"] = None
                data["max_amount"] = None
                data["currency"] = None

            job_df = pd.DataFrame([data])
            dfs.append(job_df)

    errors_list = list(errors.items())
    errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])

    if dfs:
        df = pd.concat(dfs, ignore_index=True)
        # The two column orders differ only in which URL column they expose.
        url_column = "job_url_hyper" if hyperlinks else "job_url"
        desired_order = [
            "site",
            "title",
            "company",
            "location",
            "job_type",
            "interval",
            "min_amount",
            "max_amount",
            "currency",
            url_column,
            "description",
        ]
        df = df[desired_order]
    else:
        df = pd.DataFrame()

    return ScrapeResults(jobs=df, errors=errors_df)
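

# A minimal usage sketch (hedged: the search term, location, and result count
# are illustrative, not defaults of this module):
#
#     from jobspy import scrape_jobs, Site          # Site import path assumed
#
#     results = scrape_jobs(
#         site_name=[Site.INDEED, Site.LINKEDIN],
#         search_term="software engineer",
#         location="Dallas, TX",
#         results_wanted=10,
#     )
#     print(results.jobs.head())                    # scraped postings
#     if not results.errors.empty:
#         print(results.errors)                     # per-site failures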