mirror of https://github.com/Bunsly/JobSpy
parent 2ec3b04777
commit 5b3627b244

@@ -67,6 +67,7 @@ Optional
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
 ├── is_remote (bool)
+├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
 ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
 ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.34"
+version = "1.1.35"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

@@ -40,6 +40,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxy: Optional[str] = None,
+    full_description: Optional[bool] = False,
     offset: Optional[int] = 0,
 ) -> pd.DataFrame:
     """

@@ -74,6 +75,7 @@ def scrape_jobs(
         is_remote=is_remote,
         job_type=job_type,
         easy_apply=easy_apply,
+        full_description=full_description,
         results_wanted=results_wanted,
         offset=offset,
     )

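Taken together, the two hunks above thread the new flag from the public API down to each scraper. A minimal usage sketch, assuming python-jobspy 1.1.35 is installed; every argument other than full_description (site_name, search_term, results_wanted) is an assumed parameter name inferred from the README excerpt, not confirmed by this diff:

# Hedged sketch: opting in to full descriptions (slower; one extra request per job).
from jobspy import scrape_jobs  # assumed import path for the python-jobspy package

jobs_df = scrape_jobs(
    site_name=["indeed", "linkedin"],  # assumed name; the README excerpt calls this 'site_type'
    search_term="software engineer",   # assumed parameter name
    results_wanted=10,
    full_description=True,             # the new flag; defaults to False
)
print(jobs_df.head())  # scrape_jobs returns a pandas DataFrame per the signature above
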
@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
     is_remote: bool = False
     job_type: Optional[JobType] = None
     easy_apply: bool = None # linkedin
+    full_description: bool = False
     offset: int = 0

     results_wanted: int = 15

@@ -5,8 +5,12 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
 import json
-from typing import Optional, Any
+import requests
+from bs4 import BeautifulSoup
+from typing import Optional
 from datetime import datetime, timedelta
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ..utils import count_urgent_words, extract_emails_from_text

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException

@@ -66,50 +70,70 @@ class GlassdoorScraper(Scraper):
         jobs_data = res_json["data"]["jobListings"]["jobListings"]

         jobs = []
-        for i, job in enumerate(jobs_data):
-            job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
-                "linkItems"
-            ][i]["url"]
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-            job = job["jobview"]
-            title = job["job"]["jobTitleText"]
-            company_name = job["header"]["employerNameFromSearch"]
-            location_name = job["header"].get("locationName", "")
-            location_type = job["header"].get("locationType", "")
-            age_in_days = job["header"].get("ageInDays")
-            is_remote, location = False, None
-            date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
-
-            if location_type == "S":
-                is_remote = True
-            else:
-                location = self.parse_location(location_name)
-
-            compensation = self.parse_compensation(job["header"])
-
-            job = JobPost(
-                title=title,
-                company_name=company_name,
-                date_posted=date_posted,
-                job_url=job_url,
-                location=location,
-                compensation=compensation,
-                is_remote=is_remote
-            )
-            jobs.append(job)
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
+            future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
+            for future in as_completed(future_to_job_data):
+                job_data = future_to_job_data[future]
+                try:
+                    job_post = future.result()
+                    if job_post:
+                        jobs.append(job_post)
+                except Exception as exc:
+                    raise GlassdoorException(f'Glassdoor generated an exception: {exc}')

         return jobs, self.get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )

+    def process_job(self, job_data):
+        """Processes a single job and fetches its description."""
+        job_id = job_data["jobview"]["job"]["listingId"]
+        job_url = f'{self.url}/job-listing/?jl={job_id}'
+        if job_url in self.seen_urls:
+            return None
+        self.seen_urls.add(job_url)
+        job = job_data["jobview"]
+        title = job["job"]["jobTitleText"]
+        company_name = job["header"]["employerNameFromSearch"]
+        location_name = job["header"].get("locationName", "")
+        location_type = job["header"].get("locationType", "")
+        age_in_days = job["header"].get("ageInDays")
+        is_remote, location = False, None
+        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
+
+        if location_type == "S":
+            is_remote = True
+        else:
+            location = self.parse_location(location_name)
+
+        compensation = self.parse_compensation(job["header"])
+
+        try:
+            description = self.fetch_job_description(job_id)
+        except Exception:
+            description = None
+
+        job_post = JobPost(
+            title=title,
+            company_name=company_name,
+            date_posted=date_posted,
+            job_url=job_url,
+            location=location,
+            compensation=compensation,
+            is_remote=is_remote,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
+        )
+        return job_post
+
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Glassdoor for jobs with scraper_input criteria.
         :param scraper_input: Information about job search criteria.
         :return: JobResponse containing a list of jobs.
         """
+        scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.country = scraper_input.country
         self.url = self.country.get_url()

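The hunk above replaces the sequential per-listing loop with a thread-pool fan-out, which matters now that each listing costs an extra HTTP round-trip for its description. A self-contained sketch of the same submit/as_completed pattern, where process_item and max_workers=5 are stand-ins for process_job and self.jobs_per_page:

# Minimal sketch of the fan-out pattern used in the Glassdoor refactor.
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_item(item: int) -> int:
    # Stand-in for process_job: any I/O-bound work, e.g. an HTTP fetch.
    return item * 2

items = range(10)
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its input so failures can be attributed.
    future_to_item = {executor.submit(process_item, item): item for item in items}
    for future in as_completed(future_to_item):
        try:
            results.append(future.result())  # .result() re-raises any worker exception here
        except Exception as exc:
            raise RuntimeError(f"worker for item {future_to_item[future]} failed: {exc}")
print(sorted(results))
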
@@ -143,6 +167,43 @@ class GlassdoorScraper(Scraper):

         return JobResponse(jobs=all_jobs)

+    def fetch_job_description(self, job_id):
+        """Fetches the job description for a single job ID."""
+        url = f"{self.url}/graph"
+        body = [
+            {
+                "operationName": "JobDetailQuery",
+                "variables": {
+                    "jl": job_id,
+                    "queryString": "q",
+                    "pageTypeEnum": "SERP"
+                },
+                "query": """
+                query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
+                    jobview: jobView(
+                        listingId: $jl
+                        contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
+                    ) {
+                        job {
+                            description
+                            __typename
+                        }
+                        __typename
+                    }
+                }
+                """
+            }
+        ]
+        response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
+        if response.status_code != 200:
+            return None
+        data = response.json()[0]
+        desc = data['data']['jobview']['job']['description']
+        soup = BeautifulSoup(desc, 'html.parser')
+        description = soup.get_text(separator='\n')
+
+        return description
+
     @staticmethod
     def parse_compensation(data: dict) -> Optional[Compensation]:
         pay_period = data.get("payPeriod")

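One detail worth noting in fetch_job_description: the body posted to /graph is a JSON list holding a single operation, so the endpoint answers with a list as well, which is why the code indexes response.json()[0] before walking data → jobview → job → description. A small sketch of that unwrapping step, using a made-up payload in place of a real /graph response:

# Sketch: unwrapping a batched GraphQL response shaped like the one above.
from bs4 import BeautifulSoup

fake_response_json = [  # a one-element request body yields a one-element response list
    {"data": {"jobview": {"job": {"description": "<p>Build <b>things</b>.</p>"}}}}
]
desc_html = fake_response_json[0]["data"]["jobview"]["job"]["description"]
# The description arrives as HTML; get_text(separator="\n") keeps line breaks readable.
print(BeautifulSoup(desc_html, "html.parser").get_text(separator="\n"))
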
@@ -78,7 +78,7 @@ class IndeedScraper(Scraper):
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
-            session = create_session(self.proxy, is_tls=True)
+            session = create_session(self.proxy)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),

@@ -140,7 +140,8 @@ class IndeedScraper(Scraper):
             date_posted = datetime.fromtimestamp(timestamp_seconds)
             date_posted = date_posted.strftime("%Y-%m-%d")

-            description = self.get_description(job_url)
+            description = self.get_description(job_url) if scraper_input.full_description else None
+
             with io.StringIO(job["snippet"]) as f:
                 soup_io = BeautifulSoup(f, "html.parser")
                 li_elements = soup_io.find_all("li")

@@ -246,7 +247,7 @@ class IndeedScraper(Scraper):
             return None

         soup = BeautifulSoup(job_description, "html.parser")
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
+        text_content = "\n".join(soup.stripped_strings)

         return text_content

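The get_description change above (and the similar LinkedIn and ZipRecruiter hunks below) swaps whitespace-collapsing extraction for newline-preserving extraction, so list structure in job descriptions survives. A quick comparison sketch:

# Sketch: whitespace-collapsed vs newline-preserving text extraction.
from bs4 import BeautifulSoup

html = "<p>Requirements:</p><ul><li>Python</li><li>SQL</li></ul>"
soup = BeautifulSoup(html, "html.parser")

old_style = " ".join(soup.get_text(separator=" ").split()).strip()
new_style = "\n".join(soup.stripped_strings)

print(old_style)  # "Requirements: Python SQL"      -- list structure flattened
print(new_style)  # "Requirements:\nPython\nSQL"    -- one item per line
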
@@ -111,7 +111,7 @@ class LinkedInScraper(Scraper):

             # Call process_job directly without threading
             try:
-                job_post = self.process_job(job_card, job_url)
+                job_post = self.process_job(job_card, job_url, scraper_input.full_description)
                 if job_post:
                     job_list.append(job_post)
             except Exception as e:

@@ -123,7 +123,7 @@ class LinkedInScraper(Scraper):
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

-    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+    def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
         salary_tag = job_card.find('span', class_='job-search-card__salary-info')

         compensation = None

@@ -160,7 +160,7 @@ class LinkedInScraper(Scraper):
             if metadata_card
             else None
         )
-        date_posted = None
+        date_posted = description = job_type = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:

@@ -169,9 +169,8 @@ class LinkedInScraper(Scraper):
                 date_posted = None
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
-
-        # removed to speed up scraping
-        # description, job_type = self.get_job_description(job_url)
+        if full_descr:
+            description, job_type = self.get_job_description(job_url)

         return JobPost(
             title=title,

@@ -182,10 +181,10 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
             compensation=compensation,
             benefits=benefits,
-            # job_type=job_type,
-            # description=description,
-            # emails=extract_emails_from_text(description) if description else None,
-            # num_urgent_words=count_urgent_words(description) if description else None,
+            job_type=job_type,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
         )

     def get_job_description(

@@ -214,7 +213,7 @@ class LinkedInScraper(Scraper):

         description = None
         if div_content:
-            description = " ".join(div_content.get_text().split()).strip()
+            description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())

     def get_job_type(
         soup_job_type: BeautifulSoup,

@@ -109,7 +109,7 @@ class ZipRecruiterScraper(Scraper):

         description = BeautifulSoup(
             job.get("job_description", "").strip(), "html.parser"
-        ).get_text()
+        ).get_text(separator="\n")

         company = job["hiring_company"].get("name") if "hiring_company" in job else None
         country_value = "usa" if job.get("job_country") == "US" else "canada"