Thread sites (#40)

pull/42/head v1.1.1
Cullen Watson 2023-09-06 09:47:11 -05:00 committed by GitHub
parent 70e2218c67
commit fd883178be
7 changed files with 1012 additions and 203 deletions

File diff suppressed because one or more lines are too long


@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.0"
version = "1.1.1"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md"


@@ -1,5 +1,7 @@
import pandas as pd
from typing import List, Tuple
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict
from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
@@ -7,7 +9,6 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
@@ -15,6 +16,11 @@ SCRAPER_MAPPING = {
}
class ScrapeResults(NamedTuple):
jobs: pd.DataFrame
errors: pd.DataFrame
def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
@@ -28,8 +34,9 @@ def scrape_jobs(
job_type: JobType = None,
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country: str = "usa",
) -> pd.DataFrame:
country_indeed: str = "usa",
hyperlinks: bool = False
) -> ScrapeResults:
"""
Concurrently scrapes job data from multiple job sites using a thread pool.
:return: ScrapeResults named tuple with a `jobs` DataFrame and an `errors` DataFrame
@@ -38,7 +45,7 @@ def scrape_jobs(
if type(site_name) == str:
site_name = _map_str_to_site(site_name)
country_enum = Country.from_string(country)
country_enum = Country.from_string(country_indeed)
site_type = [site_name] if type(site_name) == Site else site_name
scraper_input = ScraperInput(
@@ -54,22 +61,35 @@
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
try:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
scraped_data: JobResponse = scraper.scrape(scraper_input)
except Exception as e:
scraped_data = JobResponse(jobs=[], error=str(e), success=False)
return site.value, scraped_data
results = {}
for site in scraper_input.site_type:
results, errors = {}, {}
def worker(site):
site_value, scraped_data = scrape_site(site)
return site_value, scraped_data
with ThreadPoolExecutor() as executor:
future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
for future in concurrent.futures.as_completed(future_to_site):
site_value, scraped_data = future.result()
results[site_value] = scraped_data
if scraped_data.error:
errors[site_value] = scraped_data.error
dfs = []
for site, job_response in results.items():
for job in job_response.jobs:
data = job.dict()
data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
data["site"] = site
data["company"] = data["company_name"]
if data["job_type"]:
@@ -99,8 +119,27 @@ def scrape_jobs(
job_df = pd.DataFrame([data])
dfs.append(job_df)
errors_list = [(key, value) for key, value in errors.items()]
errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])
if dfs:
df = pd.concat(dfs, ignore_index=True)
if hyperlinks:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper",
"description",
]
else:
desired_order = [
"site",
"title",
@@ -118,4 +157,4 @@ def scrape_jobs(
else:
df = pd.DataFrame()
return df
return ScrapeResults(jobs=df, errors=errors_df)
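For reference, a minimal usage sketch of the changed API, assuming scrape_jobs is importable from the package's top-level jobspy module (other parameters such as the search term exist but fall outside this hunk):

from jobspy import scrape_jobs  # assumed import path for python-jobspy

# scrape_jobs now fans the sites out across worker threads and returns a NamedTuple.
results = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],  # a single string such as "indeed" also works
    results_wanted=15,
    country_indeed="usa",  # renamed from the previous `country` parameter
    hyperlinks=True,       # adds the HTML `job_url_hyper` column
)

jobs_df = results.jobs      # one row per job posting
errors_df = results.errors  # columns ["Site", "Error"], one row per site that failed
if not errors_df.empty:
    print(errors_df)
print(jobs_df.head())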


@@ -27,14 +27,6 @@ class ScraperInput(BaseModel):
results_wanted: int = 15
class CommonResponse(BaseModel):
status: Optional[str]
error: Optional[str]
linkedin: Optional[Any] = None
indeed: Optional[Any] = None
zip_recruiter: Optional[Any] = None
class Scraper:
def __init__(self, site: Site):
self.site = site
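With CommonResponse removed, the per-site contract is simply a Scraper subclass whose scrape method returns a JobResponse, which is how scrape_site above consumes it. A rough sketch of that contract; ExampleScraper is a hypothetical illustration, not part of the library:

from .scrapers import Scraper, ScraperInput, Site, JobResponse  # placement inside the package is assumed

class ExampleScraper(Scraper):  # hypothetical scraper, for illustration only
    def __init__(self):
        super().__init__(Site.INDEED)  # each scraper binds itself to one Site member

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        try:
            jobs = []  # fetch and parse postings for scraper_input here
            return JobResponse(success=True, jobs=jobs)
        except Exception as e:
            # errors travel back in the response instead of being printed
            return JobResponse(jobs=[], error=str(e), success=False)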


@@ -197,7 +197,6 @@ class IndeedScraper(Scraper):
error=f"Indeed failed to parse response: {e}",
)
except Exception as e:
print(f"LinkedIn failed to scrape: {e}\n{traceback.format_exc()}")
return JobResponse(
success=False,
error=f"Indeed failed to scrape: {e}",
@@ -230,11 +229,9 @@ class IndeedScraper(Scraper):
formatted_url, allow_redirects=True, timeout_seconds=5
)
except requests.exceptions.Timeout:
print("The request timed out.")
return None
if response.status_code not in range(200, 400):
print("status code not in range")
return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][


@@ -1,7 +1,9 @@
from typing import Optional, Tuple
from datetime import datetime
import traceback
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from bs4.element import Tag
@@ -67,9 +69,12 @@ class LinkedInScraper(Scraper):
)
if response.status_code != 200:
reason = ' (too many requests)' if response.status_code == 429 else ''
return JobResponse(
success=False,
error=f"Response returned {response.status_code}",
error=f"LinkedIn returned {response.status_code} {reason}",
jobs=job_list,
total_results=job_count,
)
soup = BeautifulSoup(response.text, "html.parser")
@@ -113,7 +118,10 @@ class LinkedInScraper(Scraper):
description, job_type = LinkedInScraper.get_description(job_url)
if datetime_tag:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except Exception as e:
date_posted = None
else:
date_posted = None
@@ -130,15 +138,13 @@
),
)
job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
if processed_jobs >= job_count:
break
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
if len(job_list) >= scraper_input.results_wanted:
break
if processed_jobs >= job_count:
break
if len(job_list) >= scraper_input.results_wanted:
break
page += 1
@@ -158,7 +164,11 @@
:param job_page_url:
:return: description or None
"""
response = requests.get(job_page_url, allow_redirects=True)
try:
response = requests.get(job_page_url, timeout=5)
except Timeout:
return None, None
if response.status_code not in range(200, 400):
return None, None
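Indeed's description fetch (timeout_seconds=5 via its session wrapper) and LinkedIn's requests.get above now share the same quiet-failure shape: time-box the request and treat non-2xx/3xx statuses as "no data". Sketched as a standalone helper, with fetch_or_none as a made-up name:

import requests
from requests.exceptions import Timeout

def fetch_or_none(url: str, timeout: int = 5):
    """Return the response for a 2xx/3xx status, or None on timeout or an error status."""
    try:
        response = requests.get(url, timeout=timeout)
    except Timeout:
        return None
    if response.status_code not in range(200, 400):
        return None
    return response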


@@ -148,7 +148,6 @@ class ZipRecruiterScraper(Scraper):
error=f"ZipRecruiter returned status code {e.status_code}",
)
except Exception as e:
print(f"ZipRecruiter failed to scrape: {e}\n{traceback.format_exc()}")
return JobResponse(
success=False,
error=f"ZipRecruiter failed to scrape: {e}",
@@ -302,7 +301,6 @@
timeout_seconds=5,
)
except requests.exceptions.Timeout:
print("The request timed out.")
return None
html_string = response.content
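The change that gives this PR its name, reduced to its core: each site's scrape runs in its own worker thread and results are collected as they finish. A self-contained sketch of the pattern; scrape_one and the site list are stand-ins for the real scrapers:

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Tuple

def scrape_one(site: str) -> Tuple[str, str]:
    # Stand-in for scrape_site(); in the real code any exception is caught inside
    # and turned into a JobResponse error, so future.result() never raises.
    return site, f"scraped {site}"

sites = ["linkedin", "indeed", "zip_recruiter"]
results: Dict[str, str] = {}
with ThreadPoolExecutor() as executor:
    future_to_site = {executor.submit(scrape_one, site): site for site in sites}
    for future in concurrent.futures.as_completed(future_to_site):
        site, data = future.result()
        results[site] = data
print(results)  # keyed by site; entries arrive in completion order, not submission order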