Thread sites (#40)

pull/42/head v1.1.1
Cullen Watson 2023-09-06 09:47:11 -05:00 committed by GitHub
parent 70e2218c67
commit fd883178be
7 changed files with 1012 additions and 203 deletions

File diff suppressed because one or more lines are too long

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.0"
+version = "1.1.1"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 readme = "README.md"

@@ -1,5 +1,7 @@
 import pandas as pd
-from typing import List, Tuple
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, NamedTuple, Dict
 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -7,7 +9,6 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.linkedin import LinkedInScraper
 from .scrapers import ScraperInput, Site, JobResponse, Country
 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
     Site.INDEED: IndeedScraper,
@@ -15,6 +16,11 @@ SCRAPER_MAPPING = {
 }
+class ScrapeResults(NamedTuple):
+    jobs: pd.DataFrame
+    errors: pd.DataFrame
 def _map_str_to_site(site_name: str) -> Site:
     return Site[site_name.upper()]
@@ -28,8 +34,9 @@ def scrape_jobs(
     job_type: JobType = None,
     easy_apply: bool = False,  # linkedin
     results_wanted: int = 15,
-    country: str = "usa",
-) -> pd.DataFrame:
+    country_indeed: str = "usa",
+    hyperlinks: bool = False
+) -> ScrapeResults:
     """
     Asynchronously scrapes job data from multiple job sites.
     :return: results_wanted: pandas dataframe containing job data
@@ -38,7 +45,7 @@ def scrape_jobs(
     if type(site_name) == str:
         site_name = _map_str_to_site(site_name)
-    country_enum = Country.from_string(country)
+    country_enum = Country.from_string(country_indeed)
     site_type = [site_name] if type(site_name) == Site else site_name
     scraper_input = ScraperInput(
@@ -54,22 +61,35 @@ def scrape_jobs(
     )
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
-        scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class()
-        scraped_data: JobResponse = scraper.scrape(scraper_input)
+        try:
+            scraper_class = SCRAPER_MAPPING[site]
+            scraper = scraper_class()
+            scraped_data: JobResponse = scraper.scrape(scraper_input)
+        except Exception as e:
+            scraped_data = JobResponse(jobs=[], error=str(e), success=False)
         return site.value, scraped_data
-    results = {}
-    for site in scraper_input.site_type:
+    results, errors = {}, {}
+    def worker(site):
         site_value, scraped_data = scrape_site(site)
-        results[site_value] = scraped_data
+        return site_value, scraped_data
+    with ThreadPoolExecutor() as executor:
+        future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
+        for future in concurrent.futures.as_completed(future_to_site):
+            site_value, scraped_data = future.result()
+            results[site_value] = scraped_data
+            if scraped_data.error:
+                errors[site_value] = scraped_data.error
     dfs = []
     for site, job_response in results.items():
         for job in job_response.jobs:
             data = job.dict()
+            data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
             data["site"] = site
             data["company"] = data["company_name"]
             if data["job_type"]:
@@ -99,23 +119,42 @@ def scrape_jobs(
             job_df = pd.DataFrame([data])
             dfs.append(job_df)
+    errors_list = [(key, value) for key, value in errors.items()]
+    errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])
     if dfs:
         df = pd.concat(dfs, ignore_index=True)
-        desired_order = [
-            "site",
-            "title",
-            "company",
-            "location",
-            "job_type",
-            "interval",
-            "min_amount",
-            "max_amount",
-            "currency",
-            "job_url",
-            "description",
-        ]
+        if hyperlinks:
+            desired_order = [
+                "site",
+                "title",
+                "company",
+                "location",
+                "job_type",
+                "interval",
+                "min_amount",
+                "max_amount",
+                "currency",
+                "job_url_hyper",
+                "description",
+            ]
+        else:
+            desired_order = [
+                "site",
+                "title",
+                "company",
+                "location",
+                "job_type",
+                "interval",
+                "min_amount",
+                "max_amount",
+                "currency",
+                "job_url",
+                "description",
+            ]
         df = df[desired_order]
     else:
         df = pd.DataFrame()
-    return df
+    return ScrapeResults(jobs=df, errors=errors_df)
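
The hunks above change the public surface of `scrape_jobs`: `country` is renamed to `country_indeed`, a `hyperlinks` flag controls the anchor-tag `job_url_hyper` column, and the function now returns a `ScrapeResults` NamedTuple holding a jobs DataFrame and a per-site errors DataFrame instead of a single DataFrame. A minimal usage sketch, assuming the package is importable as `jobspy`; only `site_name` appears in this diff, so `search_term` is an assumed pre-existing parameter:

# Hedged usage sketch of the new return type; import path and search_term are assumptions.
from jobspy import scrape_jobs

results = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],  # strings map via Site[name.upper()]
    search_term="software engineer",                    # assumed existing parameter
    results_wanted=10,
    country_indeed="usa",   # renamed from country in this commit
    hyperlinks=True,        # include the job_url_hyper column instead of job_url
)

print(results.jobs.head())      # jobs: pd.DataFrame (empty if every site failed)
if not results.errors.empty:    # errors: pd.DataFrame with "Site" and "Error" columns
    print(results.errors)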

@@ -27,14 +27,6 @@ class ScraperInput(BaseModel):
     results_wanted: int = 15
-class CommonResponse(BaseModel):
-    status: Optional[str]
-    error: Optional[str]
-    linkedin: Optional[Any] = None
-    indeed: Optional[Any] = None
-    zip_recruiter: Optional[Any] = None
 class Scraper:
     def __init__(self, site: Site):
         self.site = site

@@ -197,7 +197,6 @@ class IndeedScraper(Scraper):
                 error=f"Indeed failed to parse response: {e}",
             )
         except Exception as e:
-            print(f"LinkedIn failed to scrape: {e}\n{traceback.format_exc()}")
             return JobResponse(
                 success=False,
                 error=f"Indeed failed to scrape: {e}",
@@ -230,11 +229,9 @@ class IndeedScraper(Scraper):
                 formatted_url, allow_redirects=True, timeout_seconds=5
             )
         except requests.exceptions.Timeout:
-            print("The request timed out.")
             return None
         if response.status_code not in range(200, 400):
-            print("status code not in range")
             return None
         raw_description = response.json()["body"]["jobInfoWrapperModel"][

@@ -1,7 +1,9 @@
 from typing import Optional, Tuple
 from datetime import datetime
+import traceback
 import requests
+from requests.exceptions import Timeout
 from bs4 import BeautifulSoup
 from bs4.element import Tag
@@ -67,9 +69,12 @@ class LinkedInScraper(Scraper):
             )
             if response.status_code != 200:
+                reason = ' (too many requests)' if response.status_code == 429 else ''
                 return JobResponse(
                     success=False,
-                    error=f"Response returned {response.status_code}",
+                    error=f"LinkedIn returned {response.status_code} {reason}",
+                    jobs=job_list,
+                    total_results=job_count,
                 )
             soup = BeautifulSoup(response.text, "html.parser")
@@ -113,7 +118,10 @@ class LinkedInScraper(Scraper):
                 description, job_type = LinkedInScraper.get_description(job_url)
                 if datetime_tag:
                     datetime_str = datetime_tag["datetime"]
-                    date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+                    try:
+                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+                    except Exception as e:
+                        date_posted = None
                 else:
                     date_posted = None
@@ -130,15 +138,13 @@ class LinkedInScraper(Scraper):
                     ),
                 )
                 job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= job_count
-                ):
+                if processed_jobs >= job_count:
                     break
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= job_count
-            ):
+                if len(job_list) >= scraper_input.results_wanted:
+                    break
+            if processed_jobs >= job_count:
+                break
+            if len(job_list) >= scraper_input.results_wanted:
                 break
             page += 1
@@ -158,7 +164,11 @@ class LinkedInScraper(Scraper):
         :param job_page_url:
         :return: description or None
         """
-        response = requests.get(job_page_url, allow_redirects=True)
+        try:
+            response = requests.get(job_page_url, timeout=5)
+        except Timeout:
+            return None, None
         if response.status_code not in range(200, 400):
             return None, None
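
The `get_description` hunk above shows the hardening pattern this commit applies across the scrapers: bound the request with a timeout, turn a `Timeout` into a quiet `None` result, and bail out on non-2xx/3xx status codes instead of printing. A self-contained sketch of that pattern, using a placeholder selector rather than LinkedIn's real markup:

# Illustrative sketch of the timeout-guarded fetch pattern; "div.description"
# is a placeholder assumption, not the selector the scraper actually uses.
from typing import Optional
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup

def fetch_description(job_page_url: str) -> Optional[str]:
    try:
        response = requests.get(job_page_url, timeout=5)
    except Timeout:
        return None  # swallow the timeout; callers treat None as "no description"
    if response.status_code not in range(200, 400):
        return None  # non-success status codes are handled the same way
    soup = BeautifulSoup(response.text, "html.parser")
    tag = soup.select_one("div.description")
    return tag.get_text(strip=True) if tag else None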

@@ -148,7 +148,6 @@ class ZipRecruiterScraper(Scraper):
                 error=f"ZipRecruiter returned status code {e.status_code}",
             )
         except Exception as e:
-            print(f"ZipRecruiter failed to scrape: {e}\n{traceback.format_exc()}")
             return JobResponse(
                 success=False,
                 error=f"ZipRecruiter failed to scrape: {e}",
@@ -302,7 +301,6 @@ class ZipRecruiterScraper(Scraper):
                 timeout_seconds=5,
             )
         except requests.exceptions.Timeout:
-            print("The request timed out.")
             return None
         html_string = response.content