mirror of https://github.com/Bunsly/JobSpy

black format

parent f584f1b32f
commit 184063edb6
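This commit applies the black code formatter across the scrapers: single quotes become double quotes, long calls and import lists are exploded one element per line with a magic trailing comma, and expressions that fit on one line are collapsed back onto it. A minimal sketch of the same rewrite through black's Python API (format_str and Mode are part of black's public API; the snippet assumes black is installed):

    import black

    # Black normalizes single quotes to double quotes, the same rewrite
    # seen in the data['site'] -> data["site"] hunks below.
    src = "data['site'] = site"
    print(black.format_str(src, mode=black.Mode()), end="")
    # -> data["site"] = site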
@@ -24,15 +24,14 @@ def _map_str_to_site(site_name: str) -> Site:


 def scrape_jobs(
     site_name: str | Site | List[Site],
     search_term: str,
     location: str = "",
     distance: int = None,
     is_remote: bool = False,
     job_type: JobType = None,
     easy_apply: bool = False,  # linkedin
-    results_wanted: int = 15,
-
+    results_wanted: int = 15
 ) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
@@ -71,48 +70,59 @@ def scrape_jobs
     for site, job_response in results.items():
         for job in job_response.jobs:
             data = job.dict()
-            data['site'] = site
+            data["site"] = site

             # Formatting JobType
-            data['job_type'] = data['job_type'].value if data['job_type'] else None
+            data["job_type"] = data["job_type"].value if data["job_type"] else None

             # Formatting Location
-            location_obj = data.get('location')
+            location_obj = data.get("location")
             if location_obj and isinstance(location_obj, dict):
-                data['city'] = location_obj.get('city', '')
-                data['state'] = location_obj.get('state', '')
-                data['country'] = location_obj.get('country', 'USA')
+                data["city"] = location_obj.get("city", "")
+                data["state"] = location_obj.get("state", "")
+                data["country"] = location_obj.get("country", "USA")
             else:
-                data['city'] = None
-                data['state'] = None
-                data['country'] = None
+                data["city"] = None
+                data["state"] = None
+                data["country"] = None

             # Formatting Compensation
-            compensation_obj = data.get('compensation')
+            compensation_obj = data.get("compensation")
             if compensation_obj and isinstance(compensation_obj, dict):
-                data['interval'] = compensation_obj.get('interval').value if compensation_obj.get('interval') else None
-                data['min_amount'] = compensation_obj.get('min_amount')
-                data['max_amount'] = compensation_obj.get('max_amount')
-                data['currency'] = compensation_obj.get('currency', 'USD')
+                data["interval"] = (
+                    compensation_obj.get("interval").value
+                    if compensation_obj.get("interval")
+                    else None
+                )
+                data["min_amount"] = compensation_obj.get("min_amount")
+                data["max_amount"] = compensation_obj.get("max_amount")
+                data["currency"] = compensation_obj.get("currency", "USD")
             else:
-                data['interval'] = None
-                data['min_amount'] = None
-                data['max_amount'] = None
-                data['currency'] = None
+                data["interval"] = None
+                data["min_amount"] = None
+                data["max_amount"] = None
+                data["currency"] = None

             job_df = pd.DataFrame([data])
             dfs.append(job_df)

     if dfs:
         df = pd.concat(dfs, ignore_index=True)
-        desired_order = ['site', 'title', 'company_name', 'city', 'state','job_type',
-                         'interval', 'min_amount', 'max_amount', 'job_url', 'description',]
+        desired_order = [
+            "site",
+            "title",
+            "company_name",
+            "city",
+            "state",
+            "job_type",
+            "interval",
+            "min_amount",
+            "max_amount",
+            "job_url",
+            "description",
+        ]
         df = df[desired_order]
     else:
         df = pd.DataFrame()

     return df
-
-
-
-
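For reference, a hedged usage sketch of the scrape_jobs signature formatted above. The import path is assumed from the in-repo src.jobspy layout, and the selected columns mirror desired_order; neither is confirmed outside this diff:

    from src.jobspy import scrape_jobs  # assumed import path

    df = scrape_jobs(
        site_name="indeed",
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=10,
    )
    # Columns come back in the desired_order shown above.
    print(df[["site", "title", "company_name", "min_amount", "max_amount"]])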
@@ -10,7 +10,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

-from src.jobspy.jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from src.jobspy.jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+)
 from .. import Scraper, ScraperInput, Site, StatusException

@@ -60,10 +67,7 @@ class IndeedScraper(Scraper):
         params["sc"] = "0kf:" + "".join(sc_values) + ";"
         response = session.get(self.url + "/jobs", params=params)

-        if (
-            response.status_code != 200
-            and response.status_code != 307
-        ):
+        if response.status_code != 200 and response.status_code != 307:
             raise StatusException(response.status_code)

         soup = BeautifulSoup(response.content, "html.parser")
@@ -135,8 +139,10 @@ class IndeedScraper(Scraper):
             return job_post

         with ThreadPoolExecutor(max_workers=10) as executor:
-            job_results: list[Future] = [executor.submit(process_job, job) for job in
-                                         jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]]
+            job_results: list[Future] = [
+                executor.submit(process_job, job)
+                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+            ]

         job_list = [result.result() for result in job_results if result.result()]
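The hunk above only re-wraps Indeed's ThreadPoolExecutor fan-out, but the pattern it preserves recurs in every scraper here: submit one future per job card, then collect the results and drop falsy ones. A self-contained sketch with a stand-in process_job (the real one parses Indeed's mosaic job-card payload):

    from concurrent.futures import ThreadPoolExecutor, Future

    def process_job(job: dict) -> dict | None:
        # Stand-in parser; returning None drops the card, as in the scraper.
        return job if job.get("title") else None

    cards = [{"title": "Backend Engineer"}, {"title": ""}, {"title": "Data Analyst"}]
    with ThreadPoolExecutor(max_workers=10) as executor:
        job_results: list[Future] = [executor.submit(process_job, job) for job in cards]
    job_list = [result.result() for result in job_results if result.result()]
    # job_list keeps only the two cards with a non-empty title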
@@ -6,7 +6,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag

 from .. import Scraper, ScraperInput, Site
-from src.jobspy.jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
+from src.jobspy.jobs import (
+    JobPost,
+    Location,
+    JobResponse,
+    JobType,
+    Compensation,
+    CompensationInterval,
+)


 class LinkedInScraper(Scraper):
@@ -117,7 +124,9 @@ class LinkedInScraper(Scraper):
                 date_posted=date_posted,
                 job_url=job_url,
                 job_type=job_type,
-                compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
+                compensation=Compensation(
+                    interval=CompensationInterval.YEARLY, currency="USD"
+                ),
             )
             job_list.append(job_post)
             if (
@@ -11,7 +11,14 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

 from .. import Scraper, ScraperInput, Site, StatusException
-from src.jobspy.jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from src.jobspy.jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+)


 class ZipRecruiterScraper(Scraper):
@@ -55,7 +62,7 @@ class ZipRecruiterScraper(Scraper):
             "search": scraper_input.search_term,
             "location": scraper_input.location,
             "page": page,
-            "form": "jobs-landing"
+            "form": "jobs-landing",
         }

         if scraper_input.is_remote:
@@ -65,7 +72,9 @@ class ZipRecruiterScraper(Scraper):
             params["radius"] = scraper_input.distance

         if job_type_value:
-            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+            params[
+                "refine_by_employment"
+            ] = f"employment_type:employment_type:{job_type_value}"

         response = self.session.get(
             self.url + "/jobs-search",
@@ -90,11 +99,14 @@ class ZipRecruiterScraper(Scraper):
         with ThreadPoolExecutor(max_workers=10) as executor:
             if "jobList" in data and data["jobList"]:
                 jobs_js = data["jobList"]
-                job_results = [executor.submit(self.process_job_js, job) for job in jobs_js]
+                job_results = [
+                    executor.submit(self.process_job_js, job) for job in jobs_js
+                ]
             else:
                 jobs_html = soup.find_all("div", {"class": "job_content"})
-                job_results = [executor.submit(self.process_job_html, job) for job in
-                               jobs_html]
+                job_results = [
+                    executor.submit(self.process_job_html, job) for job in jobs_html
+                ]

         job_list = [result.result() for result in job_results if result.result()]
@@ -107,8 +119,9 @@ class ZipRecruiterScraper(Scraper):
         :return: job_response
         """

-        pages_to_process = max(3, math.ceil(scraper_input.results_wanted / self.jobs_per_page))
+        pages_to_process = max(
+            3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
+        )

         try:
             #: get first page to initialize session
@@ -125,7 +138,6 @@ class ZipRecruiterScraper(Scraper):
                 job_list += jobs

-
         except StatusException as e:
             return JobResponse(
                 success=False,
@@ -162,9 +174,7 @@ class ZipRecruiterScraper(Scraper):
         title = job.find("h2", {"class": "title"}).text
         company = job.find("a", {"class": "company_name"}).text.strip()

-        description, updated_job_url = self.get_description(
-            job_url
-        )
+        description, updated_job_url = self.get_description(job_url)
         if updated_job_url is not None:
             job_url = updated_job_url
         if description is None:
@@ -173,10 +183,7 @@ class ZipRecruiterScraper(Scraper):
         job_type_element = job.find("li", {"class": "perk_item perk_type"})
         if job_type_element:
             job_type_text = (
-                job_type_element.text.strip()
-                .lower()
-                .replace("-", "")
-                .replace(" ", "")
+                job_type_element.text.strip().lower().replace("-", "").replace(" ", "")
             )
             if job_type_text == "contractor":
                 job_type_text = "contract"
@@ -201,12 +208,16 @@ class ZipRecruiterScraper(Scraper):
     def process_job_js(self, job: dict) -> JobPost:
         # Map the job data to the expected fields by the Pydantic model
         title = job.get("Title")
-        description = BeautifulSoup(job.get("Snippet","").strip(), "html.parser").get_text()
+        description = BeautifulSoup(
+            job.get("Snippet", "").strip(), "html.parser"
+        ).get_text()

         company = job.get("OrgName")
         location = Location(city=job.get("City"), state=job.get("State"))
         try:
-            job_type = ZipRecruiterScraper.job_type_from_string(job.get("EmploymentType", "").replace("-", "_").lower())
+            job_type = ZipRecruiterScraper.job_type_from_string(
+                job.get("EmploymentType", "").replace("-", "_").lower()
+            )
         except ValueError:
             # print(f"Skipping job due to unrecognized job type: {job.get('EmploymentType')}")
             return None
@@ -215,14 +226,14 @@ class ZipRecruiterScraper(Scraper):
         salary_parts = formatted_salary.split(" ")

         min_salary_str = salary_parts[0][1:].replace(",", "")
-        if '.' in min_salary_str:
+        if "." in min_salary_str:
             min_amount = int(float(min_salary_str) * 1000)
         else:
             min_amount = int(min_salary_str.replace("K", "000"))

         if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
             max_salary_str = salary_parts[2][1:].replace(",", "")
-            if '.' in max_salary_str:
+            if "." in max_salary_str:
                 max_amount = int(float(max_salary_str) * 1000)
             else:
                 max_amount = int(max_salary_str.replace("K", "000"))
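The salary parsing reformatted above encodes a small convention for ZipRecruiter strings such as "$55K - $65K": split on spaces, strip the leading "$", expand "K" to "000", and treat a value containing "." as thousands (so "32.5" becomes 32500). A worked sketch of the min/max arithmetic in isolation, with an assumed input shape:

    formatted_salary = "$55K - $65K"             # assumed example input
    salary_parts = formatted_salary.split(" ")   # ["$55K", "-", "$65K"]

    min_salary_str = salary_parts[0][1:].replace(",", "")   # "55K"
    min_amount = int(min_salary_str.replace("K", "000"))    # 55000

    if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
        max_salary_str = salary_parts[2][1:].replace(",", "")  # "65K"
        max_amount = int(max_salary_str.replace("K", "000"))   # 65000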
@@ -232,10 +243,12 @@ class ZipRecruiterScraper(Scraper):
             compensation = Compensation(
                 interval=CompensationInterval.YEARLY,
                 min_amount=min_amount,
-                max_amount=max_amount
+                max_amount=max_amount,
             )
         save_job_url = job.get("SaveJobURL", "")
-        posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url)
+        posted_time_match = re.search(
+            r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
+        )
         if posted_time_match:
             date_time_str = posted_time_match.group(1)
             date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
@@ -269,10 +282,7 @@ class ZipRecruiterScraper(Scraper):
                 return item
         raise ValueError(f"Invalid value for JobType: {value}")

-    def get_description(
-        self,
-        job_page_url: str
-    ) -> Tuple[Optional[str], Optional[str]]:
+    def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
         """
         Retrieves job description by going to the job page url
         :param job_page_url: