mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b97c73ffd6 | ||
|
|
5b3627b244 |
@@ -67,6 +67,7 @@ Optional
|
||||
├── job_type (enum): fulltime, parttime, internship, contract
|
||||
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
|
||||
├── is_remote (bool)
|
||||
├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
|
||||
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
|
||||
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
|
||||
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.34"
|
||||
version = "1.1.36"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||
homepage = "https://github.com/Bunsly/JobSpy"
|
||||
|
||||
@@ -40,6 +40,7 @@ def scrape_jobs(
|
||||
country_indeed: str = "usa",
|
||||
hyperlinks: bool = False,
|
||||
proxy: Optional[str] = None,
|
||||
full_description: Optional[bool] = False,
|
||||
offset: Optional[int] = 0,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
@@ -74,6 +75,7 @@ def scrape_jobs(
|
||||
is_remote=is_remote,
|
||||
job_type=job_type,
|
||||
easy_apply=easy_apply,
|
||||
full_description=full_description,
|
||||
results_wanted=results_wanted,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from typing import Union, Optional
|
||||
from typing import Optional
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, validator
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class JobType(Enum):
|
||||
|
||||
@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
|
||||
is_remote: bool = False
|
||||
job_type: Optional[JobType] = None
|
||||
easy_apply: bool = None # linkedin
|
||||
full_description: bool = False
|
||||
offset: int = 0
|
||||
|
||||
results_wanted: int = 15
|
||||
|
||||
@@ -5,12 +5,16 @@ jobspy.scrapers.glassdoor
|
||||
This module contains routines to scrape Glassdoor.
|
||||
"""
|
||||
import json
|
||||
from typing import Optional, Any
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import GlassdoorException
|
||||
from ..utils import create_session
|
||||
from ..utils import create_session, modify_and_get_description
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
@@ -66,50 +70,70 @@ class GlassdoorScraper(Scraper):
|
||||
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
||||
|
||||
jobs = []
|
||||
for i, job in enumerate(jobs_data):
|
||||
job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
|
||||
"linkItems"
|
||||
][i]["url"]
|
||||
if job_url in self.seen_urls:
|
||||
continue
|
||||
self.seen_urls.add(job_url)
|
||||
job = job["jobview"]
|
||||
title = job["job"]["jobTitleText"]
|
||||
company_name = job["header"]["employerNameFromSearch"]
|
||||
location_name = job["header"].get("locationName", "")
|
||||
location_type = job["header"].get("locationType", "")
|
||||
age_in_days = job["header"].get("ageInDays")
|
||||
is_remote, location = False, None
|
||||
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
|
||||
|
||||
if location_type == "S":
|
||||
is_remote = True
|
||||
else:
|
||||
location = self.parse_location(location_name)
|
||||
|
||||
compensation = self.parse_compensation(job["header"])
|
||||
|
||||
job = JobPost(
|
||||
title=title,
|
||||
company_name=company_name,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
location=location,
|
||||
compensation=compensation,
|
||||
is_remote=is_remote
|
||||
)
|
||||
jobs.append(job)
|
||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||
future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
|
||||
for future in as_completed(future_to_job_data):
|
||||
job_data = future_to_job_data[future]
|
||||
try:
|
||||
job_post = future.result()
|
||||
if job_post:
|
||||
jobs.append(job_post)
|
||||
except Exception as exc:
|
||||
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
|
||||
|
||||
return jobs, self.get_cursor_for_page(
|
||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||
)
|
||||
|
||||
def process_job(self, job_data):
|
||||
"""Processes a single job and fetches its description."""
|
||||
job_id = job_data["jobview"]["job"]["listingId"]
|
||||
job_url = f'{self.url}/job-listing/?jl={job_id}'
|
||||
if job_url in self.seen_urls:
|
||||
return None
|
||||
self.seen_urls.add(job_url)
|
||||
job = job_data["jobview"]
|
||||
title = job["job"]["jobTitleText"]
|
||||
company_name = job["header"]["employerNameFromSearch"]
|
||||
location_name = job["header"].get("locationName", "")
|
||||
location_type = job["header"].get("locationType", "")
|
||||
age_in_days = job["header"].get("ageInDays")
|
||||
is_remote, location = False, None
|
||||
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
|
||||
|
||||
if location_type == "S":
|
||||
is_remote = True
|
||||
else:
|
||||
location = self.parse_location(location_name)
|
||||
|
||||
compensation = self.parse_compensation(job["header"])
|
||||
|
||||
try:
|
||||
description = self.fetch_job_description(job_id)
|
||||
except Exception as e :
|
||||
description = None
|
||||
|
||||
job_post = JobPost(
|
||||
title=title,
|
||||
company_name=company_name,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
location=location,
|
||||
compensation=compensation,
|
||||
is_remote=is_remote,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
return job_post
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
Scrapes Glassdoor for jobs with scraper_input criteria.
|
||||
:param scraper_input: Information about job search criteria.
|
||||
:return: JobResponse containing a list of jobs.
|
||||
"""
|
||||
scraper_input.results_wanted = min(900, scraper_input.results_wanted)
|
||||
self.country = scraper_input.country
|
||||
self.url = self.country.get_url()
|
||||
|
||||
@@ -143,6 +167,41 @@ class GlassdoorScraper(Scraper):
|
||||
|
||||
return JobResponse(jobs=all_jobs)
|
||||
|
||||
def fetch_job_description(self, job_id):
|
||||
"""Fetches the job description for a single job ID."""
|
||||
url = f"{self.url}/graph"
|
||||
body = [
|
||||
{
|
||||
"operationName": "JobDetailQuery",
|
||||
"variables": {
|
||||
"jl": job_id,
|
||||
"queryString": "q",
|
||||
"pageTypeEnum": "SERP"
|
||||
},
|
||||
"query": """
|
||||
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
||||
jobview: jobView(
|
||||
listingId: $jl
|
||||
contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
|
||||
) {
|
||||
job {
|
||||
description
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
"""
|
||||
}
|
||||
]
|
||||
response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
data = response.json()[0]
|
||||
desc = data['data']['jobview']['job']['description']
|
||||
soup = BeautifulSoup(desc, 'html.parser')
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def parse_compensation(data: dict) -> Optional[Compensation]:
|
||||
pay_period = data.get("payPeriod")
|
||||
@@ -231,12 +290,11 @@ class GlassdoorScraper(Scraper):
|
||||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def parse_location(location_name: str) -> Location:
|
||||
def parse_location(location_name: str) -> Location | None:
|
||||
if not location_name or location_name == "Remote":
|
||||
return None
|
||||
return
|
||||
city, _, state = location_name.partition(", ")
|
||||
return Location(city=city, state=state)
|
||||
|
||||
@@ -245,7 +303,6 @@ class GlassdoorScraper(Scraper):
|
||||
for cursor_data in pagination_cursors:
|
||||
if cursor_data["pageNumber"] == page_num:
|
||||
return cursor_data["cursor"]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def headers() -> dict:
|
||||
|
||||
@@ -21,6 +21,7 @@ from ..utils import (
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
get_enum_from_job_type,
|
||||
modify_and_get_description
|
||||
)
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
@@ -78,7 +79,7 @@ class IndeedScraper(Scraper):
|
||||
if sc_values:
|
||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||
try:
|
||||
session = create_session(self.proxy, is_tls=True)
|
||||
session = create_session(self.proxy)
|
||||
response = session.get(
|
||||
f"{self.url}/jobs",
|
||||
headers=self.get_headers(),
|
||||
@@ -140,7 +141,8 @@ class IndeedScraper(Scraper):
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||
|
||||
description = self.get_description(job_url)
|
||||
description = self.get_description(job_url) if scraper_input.full_description else None
|
||||
|
||||
with io.StringIO(job["snippet"]) as f:
|
||||
soup_io = BeautifulSoup(f, "html.parser")
|
||||
li_elements = soup_io.find_all("li")
|
||||
@@ -246,9 +248,7 @@ class IndeedScraper(Scraper):
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(job_description, "html.parser")
|
||||
text_content = " ".join(soup.get_text(separator=" ").split()).strip()
|
||||
|
||||
return text_content
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def get_job_type(job: dict) -> list[JobType] | None:
|
||||
|
||||
@@ -4,23 +4,36 @@ jobspy.scrapers.linkedin
|
||||
|
||||
This module contains routines to scrape LinkedIn.
|
||||
"""
|
||||
import time
|
||||
import random
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
import time
|
||||
from requests.exceptions import ProxyError
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from threading import Lock
|
||||
from bs4.element import Tag
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import LinkedInException
|
||||
from ..utils import create_session
|
||||
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Country,
|
||||
Compensation
|
||||
)
|
||||
from ..utils import (
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
currency_parser,
|
||||
modify_and_get_description
|
||||
)
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
@@ -111,7 +124,7 @@ class LinkedInScraper(Scraper):
|
||||
|
||||
# Call process_job directly without threading
|
||||
try:
|
||||
job_post = self.process_job(job_card, job_url)
|
||||
job_post = self.process_job(job_card, job_url, scraper_input.full_description)
|
||||
if job_post:
|
||||
job_list.append(job_post)
|
||||
except Exception as e:
|
||||
@@ -123,7 +136,7 @@ class LinkedInScraper(Scraper):
|
||||
job_list = job_list[: scraper_input.results_wanted]
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
|
||||
def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
|
||||
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
|
||||
|
||||
compensation = None
|
||||
@@ -160,7 +173,7 @@ class LinkedInScraper(Scraper):
|
||||
if metadata_card
|
||||
else None
|
||||
)
|
||||
date_posted = None
|
||||
date_posted = description = job_type = None
|
||||
if datetime_tag and "datetime" in datetime_tag.attrs:
|
||||
datetime_str = datetime_tag["datetime"]
|
||||
try:
|
||||
@@ -169,9 +182,8 @@ class LinkedInScraper(Scraper):
|
||||
date_posted = None
|
||||
benefits_tag = job_card.find("span", class_="result-benefits__text")
|
||||
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
|
||||
|
||||
# removed to speed up scraping
|
||||
# description, job_type = self.get_job_description(job_url)
|
||||
if full_descr:
|
||||
description, job_type = self.get_job_description(job_url)
|
||||
|
||||
return JobPost(
|
||||
title=title,
|
||||
@@ -182,10 +194,10 @@ class LinkedInScraper(Scraper):
|
||||
job_url=job_url,
|
||||
compensation=compensation,
|
||||
benefits=benefits,
|
||||
# job_type=job_type,
|
||||
# description=description,
|
||||
# emails=extract_emails_from_text(description) if description else None,
|
||||
# num_urgent_words=count_urgent_words(description) if description else None,
|
||||
job_type=job_type,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
|
||||
def get_job_description(
|
||||
@@ -214,7 +226,7 @@ class LinkedInScraper(Scraper):
|
||||
|
||||
description = None
|
||||
if div_content:
|
||||
description = " ".join(div_content.get_text().split()).strip()
|
||||
description = modify_and_get_description(div_content)
|
||||
|
||||
def get_job_type(
|
||||
soup_job_type: BeautifulSoup,
|
||||
|
||||
@@ -8,6 +8,15 @@ from requests.adapters import HTTPAdapter, Retry
|
||||
from ..jobs import JobType
|
||||
|
||||
|
||||
def modify_and_get_description(soup):
|
||||
for li in soup.find_all('li'):
|
||||
li.string = "- " + li.get_text()
|
||||
|
||||
description = soup.get_text(separator='\n').strip()
|
||||
description = re.sub(r'\n+', '\n', description)
|
||||
return description
|
||||
|
||||
|
||||
def count_urgent_words(description: str) -> int:
|
||||
"""
|
||||
Count the number of urgent words or phrases in a job description.
|
||||
|
||||
@@ -10,14 +10,13 @@ import re
|
||||
from datetime import datetime, date
|
||||
from typing import Optional, Tuple, Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import ZipRecruiterException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
||||
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
@@ -107,9 +106,9 @@ class ZipRecruiterScraper(Scraper):
|
||||
title = job.get("name")
|
||||
job_url = job.get("job_url")
|
||||
|
||||
description = BeautifulSoup(
|
||||
job.get("job_description", "").strip(), "html.parser"
|
||||
).get_text()
|
||||
job_description_html = job.get("job_description", "").strip()
|
||||
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
||||
description = modify_and_get_description(description_soup)
|
||||
|
||||
company = job["hiring_company"].get("name") if "hiring_company" in job else None
|
||||
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
||||
|
||||
Reference in New Issue
Block a user