Proxy support (#44)

* add proxy support

* return as data frame
This commit is contained in:
Cullen Watson
2023-09-07 11:28:17 -05:00
committed by GitHub
parent a37e7f235e
commit 59f739018a
10 changed files with 366 additions and 319 deletions

View File

@@ -1,13 +1,19 @@
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict
from typing import List, Tuple, NamedTuple, Dict, Optional
import traceback
from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
)
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
@@ -16,11 +22,6 @@ SCRAPER_MAPPING = {
}
class ScrapeResults(NamedTuple):
    """Bundle of the two DataFrames produced by a scrape: jobs and per-site errors."""

    # Concatenated job rows, one row per scraped job posting.
    jobs: pd.DataFrame
    # Failures collected per site; built with the columns "Site" and "Error".
    errors: pd.DataFrame
def _map_str_to_site(site_name: str) -> Site:
    """Resolve a site name to its ``Site`` member, ignoring letter case.

    The name is upper-cased and then used as a subscript key on ``Site``.
    NOTE(review): ``Site`` is presumably an Enum, in which case an unknown
    name raises ``KeyError`` -- confirm against its definition.
    """
    member_key = site_name.upper()
    return Site[member_key]
@@ -35,17 +36,21 @@ def scrape_jobs(
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False
) -> ScrapeResults:
hyperlinks: bool = False,
proxy: Optional[str] = None,
) -> pd.DataFrame:
"""
Asynchronously scrapes job data from multiple job sites.
Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data
"""
if type(site_name) == str:
site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list
site_type = [_map_str_to_site(site) if type(site) == str else site_name for site in site_name]
site_type = [
_map_str_to_site(site) if type(site) == str else site_name
for site in site_name
]
country_enum = Country.from_string(country_indeed)
@@ -62,99 +67,95 @@ def scrape_jobs(
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
try:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
scraped_data = JobResponse(jobs=[], error=str(e), success=False)
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
if site == Site.INDEED:
raise IndeedException()
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
else:
raise e
return site.value, scraped_data
results, errors = {}, {}
site_to_jobs_dict = {}
def worker(site):
    """Thread-pool task: scrape one site and hand back (site value, response)."""
    # Unpack explicitly so a malformed result from scrape_site fails here,
    # inside the worker, rather than later when the future is consumed.
    label, response = scrape_site(site)
    return label, response
with ThreadPoolExecutor() as executor:
future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
future_to_site = {
executor.submit(worker, site): site for site in scraper_input.site_type
}
for future in concurrent.futures.as_completed(future_to_site):
site_value, scraped_data = future.result()
results[site_value] = scraped_data
if scraped_data.error:
errors[site_value] = scraped_data.error
site_to_jobs_dict[site_value] = scraped_data
dfs = []
jobs_dfs: List[pd.DataFrame] = []
for site, job_response in results.items():
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
data = job.dict()
data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
data["site"] = site
data["company"] = data["company_name"]
if data["job_type"]:
job_data = job.dict()
job_data[
"job_url_hyper"
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
if job_data["job_type"]:
# Take the first value from the job type tuple
data["job_type"] = data["job_type"].value[0]
job_data["job_type"] = job_data["job_type"].value[0]
else:
data["job_type"] = None
job_data["job_type"] = None
data["location"] = Location(**data["location"]).display_location()
job_data["location"] = Location(**job_data["location"]).display_location()
compensation_obj = data.get("compensation")
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
data["interval"] = (
job_data["interval"] = (
compensation_obj.get("interval").value
if compensation_obj.get("interval")
else None
)
data["min_amount"] = compensation_obj.get("min_amount")
data["max_amount"] = compensation_obj.get("max_amount")
data["currency"] = compensation_obj.get("currency", "USD")
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
else:
data["interval"] = None
data["min_amount"] = None
data["max_amount"] = None
data["currency"] = None
job_data["interval"] = None
job_data["min_amount"] = None
job_data["max_amount"] = None
job_data["currency"] = None
job_df = pd.DataFrame([data])
dfs.append(job_df)
job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)
errors_list = [(key, value) for key, value in errors.items()]
errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])
if dfs:
df = pd.concat(dfs, ignore_index=True)
if hyperlinks:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper",
"description",
]
else:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url",
"description",
]
df = df[desired_order]
if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [
"site",
"title",
"company",
"location",
"date_posted",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper" if hyperlinks else "job_url",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
else:
df = pd.DataFrame()
jobs_formatted_df = pd.DataFrame()
return ScrapeResults(jobs=df, errors=errors_df)
return jobs_formatted_df