Compare commits


2 Commits

Author         SHA1        Message                             Date
Cullen Watson  b97c73ffd6  fix: clean description (#88)        2024-01-28 21:50:41 -06:00
Cullen Watson  5b3627b244  enh: full description param (#85)   2024-01-22 20:22:32 -06:00
10 changed files with 149 additions and 68 deletions

README.md

@@ -67,6 +67,7 @@ Optional
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
 ├── is_remote (bool)
+├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
 ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
 ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
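
A minimal usage sketch of the new full_description flag (the site_name/search_term arguments and the printed column names follow JobSpy's documented API and are assumptions, not part of this diff):

from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],  # assumed parameter from the JobSpy README
    search_term="software engineer",   # assumed parameter from the JobSpy README
    results_wanted=10,
    full_description=True,  # new flag: fetch each posting's complete text (much slower)
)
print(jobs[["title", "company", "description"]].head())  # illustrative column names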

pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.34"
+version = "1.1.36"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

src/jobspy/__init__.py

@@ -40,6 +40,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxy: Optional[str] = None,
+    full_description: Optional[bool] = False,
     offset: Optional[int] = 0,
 ) -> pd.DataFrame:
     """
@@ -74,6 +75,7 @@ def scrape_jobs(
         is_remote=is_remote,
         job_type=job_type,
         easy_apply=easy_apply,
+        full_description=full_description,
         results_wanted=results_wanted,
         offset=offset,
     )

src/jobspy/jobs/__init__.py

@@ -1,7 +1,7 @@
-from typing import Union, Optional
+from typing import Optional
 from datetime import date
 from enum import Enum
-from pydantic import BaseModel, validator
+from pydantic import BaseModel

 class JobType(Enum):

src/jobspy/scrapers/__init__.py

@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
     is_remote: bool = False
     job_type: Optional[JobType] = None
     easy_apply: bool = None # linkedin
+    full_description: bool = False
     offset: int = 0
     results_wanted: int = 15
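
The flag travels through the shared pydantic model, so every scraper receives it. A quick construction sketch (site_type and search_term are existing ScraperInput fields not shown in this hunk; the import path assumes this repo's layout):

from jobspy.scrapers import ScraperInput, Site  # assumed import path

scraper_input = ScraperInput(
    site_type=[Site.LINKEDIN],    # field assumed from the wider model
    search_term="data engineer",  # field assumed from the wider model
    full_description=True,        # new field from this change; defaults to False
)
print(scraper_input.full_description)  # True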

src/jobspy/scrapers/glassdoor/__init__.py

@@ -5,12 +5,16 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
 import json
-from typing import Optional, Any
+import requests
+from bs4 import BeautifulSoup
+from typing import Optional
 from datetime import datetime, timedelta
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ..utils import count_urgent_words, extract_emails_from_text

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
-from ..utils import create_session
+from ..utils import create_session, modify_and_get_description
 from ...jobs import (
     JobPost,
     Compensation,
@@ -66,50 +70,70 @@ class GlassdoorScraper(Scraper):
         jobs_data = res_json["data"]["jobListings"]["jobListings"]

         jobs = []
-        for i, job in enumerate(jobs_data):
-            job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
-                "linkItems"
-            ][i]["url"]
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-            job = job["jobview"]
-            title = job["job"]["jobTitleText"]
-            company_name = job["header"]["employerNameFromSearch"]
-            location_name = job["header"].get("locationName", "")
-            location_type = job["header"].get("locationType", "")
-            age_in_days = job["header"].get("ageInDays")
-            is_remote, location = False, None
-            date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
-            if location_type == "S":
-                is_remote = True
-            else:
-                location = self.parse_location(location_name)
-            compensation = self.parse_compensation(job["header"])
-            job = JobPost(
-                title=title,
-                company_name=company_name,
-                date_posted=date_posted,
-                job_url=job_url,
-                location=location,
-                compensation=compensation,
-                is_remote=is_remote
-            )
-            jobs.append(job)
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
+            future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
+            for future in as_completed(future_to_job_data):
+                job_data = future_to_job_data[future]
+                try:
+                    job_post = future.result()
+                    if job_post:
+                        jobs.append(job_post)
+                except Exception as exc:
+                    raise GlassdoorException(f'Glassdoor generated an exception: {exc}')

         return jobs, self.get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )

+    def process_job(self, job_data):
+        """Processes a single job and fetches its description."""
+        job_id = job_data["jobview"]["job"]["listingId"]
+        job_url = f'{self.url}/job-listing/?jl={job_id}'
+        if job_url in self.seen_urls:
+            return None
+        self.seen_urls.add(job_url)
+
+        job = job_data["jobview"]
+        title = job["job"]["jobTitleText"]
+        company_name = job["header"]["employerNameFromSearch"]
+        location_name = job["header"].get("locationName", "")
+        location_type = job["header"].get("locationType", "")
+        age_in_days = job["header"].get("ageInDays")
+        is_remote, location = False, None
+        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
+        if location_type == "S":
+            is_remote = True
+        else:
+            location = self.parse_location(location_name)
+        compensation = self.parse_compensation(job["header"])
+
+        try:
+            description = self.fetch_job_description(job_id)
+        except Exception:
+            description = None
+
+        job_post = JobPost(
+            title=title,
+            company_name=company_name,
+            date_posted=date_posted,
+            job_url=job_url,
+            location=location,
+            compensation=compensation,
+            is_remote=is_remote,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
+        )
+        return job_post
+
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Glassdoor for jobs with scraper_input criteria.

         :param scraper_input: Information about job search criteria.
         :return: JobResponse containing a list of jobs.
         """
         scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.country = scraper_input.country
         self.url = self.country.get_url()
@@ -143,6 +167,41 @@ class GlassdoorScraper(Scraper):
         return JobResponse(jobs=all_jobs)

+    def fetch_job_description(self, job_id):
+        """Fetches the job description for a single job ID."""
+        url = f"{self.url}/graph"
+        body = [
+            {
+                "operationName": "JobDetailQuery",
+                "variables": {
+                    "jl": job_id,
+                    "queryString": "q",
+                    "pageTypeEnum": "SERP"
+                },
+                "query": """
+                query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
+                    jobview: jobView(
+                        listingId: $jl
+                        contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
+                    ) {
+                        job {
+                            description
+                            __typename
+                        }
+                        __typename
+                    }
+                }
+                """
+            }
+        ]
+        response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
+        if response.status_code != 200:
+            return None
+        data = response.json()[0]
+        desc = data['data']['jobview']['job']['description']
+        soup = BeautifulSoup(desc, 'html.parser')
+        return modify_and_get_description(soup)
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
@@ -231,12 +290,11 @@ class GlassdoorScraper(Scraper):
         for job_type in JobType:
             if job_type_str in job_type.value:
                 return [job_type]
-        return None

     @staticmethod
-    def parse_location(location_name: str) -> Location:
+    def parse_location(location_name: str) -> Location | None:
         if not location_name or location_name == "Remote":
-            return None
+            return
         city, _, state = location_name.partition(", ")
         return Location(city=city, state=state)
@@ -245,7 +303,6 @@ class GlassdoorScraper(Scraper):
         for cursor_data in pagination_cursors:
             if cursor_data["pageNumber"] == page_num:
                 return cursor_data["cursor"]
-        return None

     @staticmethod
     def headers() -> dict:
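
The Glassdoor change swaps the sequential per-listing loop for a thread pool so description fetches run concurrently; each worker's process_job posts a single-element GraphQL batch to /graph, which is why the code reads response.json()[0]. A minimal sketch of the same submit/as_completed pattern, with generic names rather than the scraper's own:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process(listing: dict) -> str:
    # stand-in for process_job: fetch and parse one listing
    return listing["id"].upper()

listings = [{"id": f"job{i}"} for i in range(8)]
results = []
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_listing = {executor.submit(process, item): item for item in listings}
    for future in as_completed(future_to_listing):
        result = future.result()  # re-raises any exception from the worker
        if result:
            results.append(result)

Note that as_completed yields futures in completion order, so the resulting list (like the scraper's job list) no longer preserves the original listing order.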

src/jobspy/scrapers/indeed/__init__.py

@@ -21,6 +21,7 @@ from ..utils import (
     extract_emails_from_text,
     create_session,
     get_enum_from_job_type,
+    modify_and_get_description
 )
 from ...jobs import (
     JobPost,
@@ -78,7 +79,7 @@ class IndeedScraper(Scraper):
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
-            session = create_session(self.proxy, is_tls=True)
+            session = create_session(self.proxy)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),
@@ -140,7 +141,8 @@ class IndeedScraper(Scraper):
             date_posted = datetime.fromtimestamp(timestamp_seconds)
             date_posted = date_posted.strftime("%Y-%m-%d")

-            description = self.get_description(job_url)
+            description = self.get_description(job_url) if scraper_input.full_description else None
+
             with io.StringIO(job["snippet"]) as f:
                 soup_io = BeautifulSoup(f, "html.parser")
                 li_elements = soup_io.find_all("li")
@@ -246,9 +248,7 @@ class IndeedScraper(Scraper):
             return None
         soup = BeautifulSoup(job_description, "html.parser")
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
-        return text_content
+        return modify_and_get_description(soup)

     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:

src/jobspy/scrapers/linkedin/__init__.py

@@ -4,23 +4,36 @@ jobspy.scrapers.linkedin
 This module contains routines to scrape LinkedIn.
 """
+import time
+import random
 from typing import Optional
 from datetime import datetime

 import requests
-import time
 from requests.exceptions import ProxyError
-from bs4 import BeautifulSoup
-from bs4.element import Tag
 from threading import Lock
+from bs4.element import Tag
+from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
 from ..utils import create_session
-from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
+from ...jobs import (
+    JobPost,
+    Location,
+    JobResponse,
+    JobType,
+    Country,
+    Compensation
+)
+from ..utils import (
+    count_urgent_words,
+    extract_emails_from_text,
+    get_enum_from_job_type,
+    currency_parser,
+    modify_and_get_description
+)

 class LinkedInScraper(Scraper):
@@ -111,7 +124,7 @@ class LinkedInScraper(Scraper):
                 # Call process_job directly without threading
                 try:
-                    job_post = self.process_job(job_card, job_url)
+                    job_post = self.process_job(job_card, job_url, scraper_input.full_description)
                     if job_post:
                         job_list.append(job_post)
                 except Exception as e:
@@ -123,7 +136,7 @@ class LinkedInScraper(Scraper):
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

-    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+    def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
         salary_tag = job_card.find('span', class_='job-search-card__salary-info')

         compensation = None
@@ -160,7 +173,7 @@ class LinkedInScraper(Scraper):
             if metadata_card
             else None
         )

-        date_posted = None
+        date_posted = description = job_type = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:
@@ -169,9 +182,8 @@ class LinkedInScraper(Scraper):
                 date_posted = None
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None

-        # removed to speed up scraping
-        # description, job_type = self.get_job_description(job_url)
+        if full_descr:
+            description, job_type = self.get_job_description(job_url)

         return JobPost(
             title=title,
@@ -182,10 +194,10 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
             compensation=compensation,
             benefits=benefits,
-            # job_type=job_type,
-            # description=description,
-            # emails=extract_emails_from_text(description) if description else None,
-            # num_urgent_words=count_urgent_words(description) if description else None,
+            job_type=job_type,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
         )

     def get_job_description(
@@ -214,7 +226,7 @@ class LinkedInScraper(Scraper):
         description = None
         if div_content:
-            description = " ".join(div_content.get_text().split()).strip()
+            description = modify_and_get_description(div_content)

         def get_job_type(
             soup_job_type: BeautifulSoup,

src/jobspy/scrapers/utils.py

@@ -8,6 +8,15 @@ from requests.adapters import HTTPAdapter, Retry
 from ..jobs import JobType

+def modify_and_get_description(soup):
+    for li in soup.find_all('li'):
+        li.string = "- " + li.get_text()
+
+    description = soup.get_text(separator='\n').strip()
+    description = re.sub(r'\n+', '\n', description)
+    return description
+
 def count_urgent_words(description: str) -> int:
     """
     Count the number of urgent words or phrases in a job description.
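
To see what the new helper produces, a self-contained check (the sample HTML is invented; the function body is copied from the hunk above together with the re and bs4 imports it relies on):

import re
from bs4 import BeautifulSoup

def modify_and_get_description(soup):
    for li in soup.find_all('li'):
        li.string = "- " + li.get_text()  # prefix bullets so they survive get_text()
    description = soup.get_text(separator='\n').strip()
    return re.sub(r'\n+', '\n', description)  # collapse runs of blank lines

html = "<p>Benefits</p><ul><li>Remote work</li><li>401k match</li></ul>"
print(modify_and_get_description(BeautifulSoup(html, "html.parser")))
# Benefits
# - Remote work
# - 401k match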

src/jobspy/scrapers/ziprecruiter/__init__.py

@@ -10,14 +10,13 @@ import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any

 import requests
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
 from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
+from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description

 class ZipRecruiterScraper(Scraper):
@@ -107,9 +106,9 @@ class ZipRecruiterScraper(Scraper):
         title = job.get("name")
         job_url = job.get("job_url")
-        description = BeautifulSoup(
-            job.get("job_description", "").strip(), "html.parser"
-        ).get_text()
+        job_description_html = job.get("job_description", "").strip()
+        description_soup = BeautifulSoup(job_description_html, "html.parser")
+        description = modify_and_get_description(description_soup)

         company = job["hiring_company"].get("name") if "hiring_company" in job else None
         country_value = "usa" if job.get("job_country") == "US" else "canada"