black format

pull/35/head
Cullen Watson 2023-09-03 19:18:55 -05:00
parent f584f1b32f
commit 184063edb6
4 changed files with 103 additions and 68 deletions


@@ -24,15 +24,14 @@ def _map_str_to_site(site_name: str) -> Site:
 def scrape_jobs(
     site_name: str | Site | List[Site],
     search_term: str,
     location: str = "",
     distance: int = None,
     is_remote: bool = False,
     job_type: JobType = None,
     easy_apply: bool = False,  # linkedin
-    results_wanted: int = 15
+    results_wanted: int = 15,
 ) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
@@ -71,48 +70,59 @@ def scrape_jobs(
     for site, job_response in results.items():
         for job in job_response.jobs:
             data = job.dict()
-            data['site'] = site
+            data["site"] = site

             # Formatting JobType
-            data['job_type'] = data['job_type'].value if data['job_type'] else None
+            data["job_type"] = data["job_type"].value if data["job_type"] else None

             # Formatting Location
-            location_obj = data.get('location')
+            location_obj = data.get("location")
             if location_obj and isinstance(location_obj, dict):
-                data['city'] = location_obj.get('city', '')
-                data['state'] = location_obj.get('state', '')
-                data['country'] = location_obj.get('country', 'USA')
+                data["city"] = location_obj.get("city", "")
+                data["state"] = location_obj.get("state", "")
+                data["country"] = location_obj.get("country", "USA")
             else:
-                data['city'] = None
-                data['state'] = None
-                data['country'] = None
+                data["city"] = None
+                data["state"] = None
+                data["country"] = None

             # Formatting Compensation
-            compensation_obj = data.get('compensation')
+            compensation_obj = data.get("compensation")
             if compensation_obj and isinstance(compensation_obj, dict):
-                data['interval'] = compensation_obj.get('interval').value if compensation_obj.get('interval') else None
-                data['min_amount'] = compensation_obj.get('min_amount')
-                data['max_amount'] = compensation_obj.get('max_amount')
-                data['currency'] = compensation_obj.get('currency', 'USD')
+                data["interval"] = (
+                    compensation_obj.get("interval").value
+                    if compensation_obj.get("interval")
+                    else None
+                )
+                data["min_amount"] = compensation_obj.get("min_amount")
+                data["max_amount"] = compensation_obj.get("max_amount")
+                data["currency"] = compensation_obj.get("currency", "USD")
             else:
-                data['interval'] = None
-                data['min_amount'] = None
-                data['max_amount'] = None
-                data['currency'] = None
+                data["interval"] = None
+                data["min_amount"] = None
+                data["max_amount"] = None
+                data["currency"] = None

             job_df = pd.DataFrame([data])
             dfs.append(job_df)

     if dfs:
         df = pd.concat(dfs, ignore_index=True)
-        desired_order = ['site', 'title', 'company_name', 'city', 'state','job_type',
-                         'interval', 'min_amount', 'max_amount', 'job_url', 'description',]
+        desired_order = [
+            "site",
+            "title",
+            "company_name",
+            "city",
+            "state",
+            "job_type",
+            "interval",
+            "min_amount",
+            "max_amount",
+            "job_url",
+            "description",
+        ]
         df = df[desired_order]
     else:
         df = pd.DataFrame()
     return df
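For reference, a minimal usage sketch of the function reformatted above. The signature and the src.jobspy module path come from this diff; importing scrape_jobs from the package root, the "indeed" site string, and the JobType member name are assumptions for illustration, not confirmed API.

from src.jobspy import scrape_jobs  # assumed export location
from src.jobspy.jobs import JobType

df = scrape_jobs(
    site_name="indeed",              # str | Site | List[Site]; a Site name as a string
    search_term="python developer",
    location="Austin, TX",
    distance=25,
    is_remote=False,
    job_type=JobType.FULL_TIME,      # assumes the enum defines FULL_TIME
    results_wanted=15,
)
# Columns follow the desired_order list above.
print(df[["site", "title", "company_name", "min_amount", "max_amount"]].head())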


@@ -10,7 +10,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
-from src.jobspy.jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from src.jobspy.jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+)

 from .. import Scraper, ScraperInput, Site, StatusException
@@ -60,10 +67,7 @@ class IndeedScraper(Scraper):
         params["sc"] = "0kf:" + "".join(sc_values) + ";"

         response = session.get(self.url + "/jobs", params=params)
-        if (
-            response.status_code != 200
-            and response.status_code != 307
-        ):
+        if response.status_code != 200 and response.status_code != 307:
             raise StatusException(response.status_code)

         soup = BeautifulSoup(response.content, "html.parser")
@@ -135,8 +139,10 @@ class IndeedScraper(Scraper):
             return job_post

         with ThreadPoolExecutor(max_workers=10) as executor:
-            job_results: list[Future] = [executor.submit(process_job, job) for job in
-                                         jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]]
+            job_results: list[Future] = [
+                executor.submit(process_job, job)
+                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+            ]

         job_list = [result.result() for result in job_results if result.result()]
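The reflowed block above is a standard fan-out/fan-in pattern. A self-contained sketch of the same idea, with a hypothetical worker and payload rather than Indeed's actual card schema:

from concurrent.futures import Future, ThreadPoolExecutor

def process_job(job: dict) -> dict | None:
    # Hypothetical worker: return None to drop cards that can't be parsed.
    return job if job.get("title") else None

cards = [{"title": "Data Engineer"}, {"title": ""}, {"title": "SRE"}]

with ThreadPoolExecutor(max_workers=10) as executor:
    job_results: list[Future] = [executor.submit(process_job, job) for job in cards]

# Mirrors the filtering above. Note result.result() is evaluated twice per
# future (once in the condition, once for the kept value), as in the diff.
job_list = [result.result() for result in job_results if result.result()]
print(job_list)  # [{'title': 'Data Engineer'}, {'title': 'SRE'}]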


@@ -6,7 +6,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag

 from .. import Scraper, ScraperInput, Site
-from src.jobspy.jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
+from src.jobspy.jobs import (
+    JobPost,
+    Location,
+    JobResponse,
+    JobType,
+    Compensation,
+    CompensationInterval,
+)


 class LinkedInScraper(Scraper):
@@ -117,7 +124,9 @@ class LinkedInScraper(Scraper):
                     date_posted=date_posted,
                     job_url=job_url,
                     job_type=job_type,
-                    compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
+                    compensation=Compensation(
+                        interval=CompensationInterval.YEARLY, currency="USD"
+                    ),
                 )
                 job_list.append(job_post)
                 if (


@@ -11,7 +11,14 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

 from .. import Scraper, ScraperInput, Site, StatusException
-from src.jobspy.jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from src.jobspy.jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+)


 class ZipRecruiterScraper(Scraper):
@@ -55,7 +62,7 @@ class ZipRecruiterScraper(Scraper):
             "search": scraper_input.search_term,
             "location": scraper_input.location,
             "page": page,
-            "form": "jobs-landing"
+            "form": "jobs-landing",
         }

         if scraper_input.is_remote:
@@ -65,7 +72,9 @@ class ZipRecruiterScraper(Scraper):
             params["radius"] = scraper_input.distance

         if job_type_value:
-            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+            params[
+                "refine_by_employment"
+            ] = f"employment_type:employment_type:{job_type_value}"

         response = self.session.get(
             self.url + "/jobs-search",
@@ -90,11 +99,14 @@ class ZipRecruiterScraper(Scraper):
         with ThreadPoolExecutor(max_workers=10) as executor:
             if "jobList" in data and data["jobList"]:
                 jobs_js = data["jobList"]
-                job_results = [executor.submit(self.process_job_js, job) for job in jobs_js]
+                job_results = [
+                    executor.submit(self.process_job_js, job) for job in jobs_js
+                ]
             else:
                 jobs_html = soup.find_all("div", {"class": "job_content"})
-                job_results = [executor.submit(self.process_job_html, job) for job in
-                               jobs_html]
+                job_results = [
+                    executor.submit(self.process_job_html, job) for job in jobs_html
+                ]

         job_list = [result.result() for result in job_results if result.result()]
@@ -107,8 +119,9 @@ class ZipRecruiterScraper(Scraper):
         :return: job_response
         """

-        pages_to_process = max(3, math.ceil(scraper_input.results_wanted / self.jobs_per_page))
+        pages_to_process = max(
+            3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
+        )

         try:
             #: get first page to initialize session
@@ -125,7 +138,6 @@ class ZipRecruiterScraper(Scraper):
                 job_list += jobs

-
         except StatusException as e:
             return JobResponse(
                 success=False,
@@ -162,9 +174,7 @@ class ZipRecruiterScraper(Scraper):
         title = job.find("h2", {"class": "title"}).text
         company = job.find("a", {"class": "company_name"}).text.strip()

-        description, updated_job_url = self.get_description(
-            job_url
-        )
+        description, updated_job_url = self.get_description(job_url)
         if updated_job_url is not None:
             job_url = updated_job_url
         if description is None:
@@ -173,10 +183,7 @@ class ZipRecruiterScraper(Scraper):
         job_type_element = job.find("li", {"class": "perk_item perk_type"})
         if job_type_element:
             job_type_text = (
-                job_type_element.text.strip()
-                .lower()
-                .replace("-", "")
-                .replace(" ", "")
+                job_type_element.text.strip().lower().replace("-", "").replace(" ", "")
             )
             if job_type_text == "contractor":
                 job_type_text = "contract"
@@ -201,12 +208,16 @@ class ZipRecruiterScraper(Scraper):
     def process_job_js(self, job: dict) -> JobPost:
         # Map the job data to the expected fields by the Pydantic model
         title = job.get("Title")
-        description = BeautifulSoup(job.get("Snippet","").strip(), "html.parser").get_text()
+        description = BeautifulSoup(
+            job.get("Snippet", "").strip(), "html.parser"
+        ).get_text()

         company = job.get("OrgName")
         location = Location(city=job.get("City"), state=job.get("State"))
         try:
-            job_type = ZipRecruiterScraper.job_type_from_string(job.get("EmploymentType", "").replace("-", "_").lower())
+            job_type = ZipRecruiterScraper.job_type_from_string(
+                job.get("EmploymentType", "").replace("-", "_").lower()
+            )
         except ValueError:
             # print(f"Skipping job due to unrecognized job type: {job.get('EmploymentType')}")
             return None
@@ -215,14 +226,14 @@ class ZipRecruiterScraper(Scraper):
             salary_parts = formatted_salary.split(" ")
             min_salary_str = salary_parts[0][1:].replace(",", "")
-            if '.' in min_salary_str:
+            if "." in min_salary_str:
                 min_amount = int(float(min_salary_str) * 1000)
             else:
                 min_amount = int(min_salary_str.replace("K", "000"))

             if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
                 max_salary_str = salary_parts[2][1:].replace(",", "")
-                if '.' in max_salary_str:
+                if "." in max_salary_str:
                     max_amount = int(float(max_salary_str) * 1000)
                 else:
                     max_amount = int(max_salary_str.replace("K", "000"))
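A standalone re-run of the salary parsing above with an illustrative formatted_salary value; the real string comes from ZipRecruiter's payload. Note the decimal branch assumes a plain number like "32.5" with no trailing "K", since float() would otherwise raise:

formatted_salary = "$110K - $130K"                        # illustrative value
salary_parts = formatted_salary.split(" ")                # ['$110K', '-', '$130K']

min_salary_str = salary_parts[0][1:].replace(",", "")     # '110K'
if "." in min_salary_str:
    min_amount = int(float(min_salary_str) * 1000)        # e.g. '32.5' -> 32500
else:
    min_amount = int(min_salary_str.replace("K", "000"))  # 110000

max_amount = None
if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
    max_salary_str = salary_parts[2][1:].replace(",", "")
    if "." in max_salary_str:
        max_amount = int(float(max_salary_str) * 1000)
    else:
        max_amount = int(max_salary_str.replace("K", "000"))  # 130000

print(min_amount, max_amount)  # 110000 130000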
@@ -232,10 +243,12 @@ class ZipRecruiterScraper(Scraper):
             compensation = Compensation(
                 interval=CompensationInterval.YEARLY,
                 min_amount=min_amount,
-                max_amount=max_amount
+                max_amount=max_amount,
             )

         save_job_url = job.get("SaveJobURL", "")
-        posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url)
+        posted_time_match = re.search(
+            r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
+        )
         if posted_time_match:
             date_time_str = posted_time_match.group(1)
             date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
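The re.search reflow above only rewraps the call. For reference, a self-contained run with an invented SaveJobURL; the query parameter name and timestamp format are taken from the pattern itself:

import re
from datetime import datetime

save_job_url = "https://example.com/save?posted_time=2023-09-01T12:30:00Z"  # invented
posted_time_match = re.search(
    r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
)
if posted_time_match:
    date_time_str = posted_time_match.group(1)
    date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
    print(date_posted_obj.date())  # 2023-09-01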
@@ -269,10 +282,7 @@ class ZipRecruiterScraper(Scraper):
                 return item
         raise ValueError(f"Invalid value for JobType: {value}")

-    def get_description(
-        self,
-        job_page_url: str
-    ) -> Tuple[Optional[str], Optional[str]]:
+    def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
         """
         Retrieves job description by going to the job page url
         :param job_page_url: