diff --git a/pyproject.toml b/pyproject.toml index 5c43149..4d8e71e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.35" +version = "1.1.36" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index f018b1e..db600f1 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -1,7 +1,7 @@ -from typing import Union, Optional +from typing import Optional from datetime import date from enum import Enum -from pydantic import BaseModel, validator +from pydantic import BaseModel class JobType(Enum): diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 49099b2..69dd3e4 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -14,7 +14,7 @@ from ..utils import count_urgent_words, extract_emails_from_text from .. 
import Scraper, ScraperInput, Site from ..exceptions import GlassdoorException -from ..utils import create_session +from ..utils import create_session, modify_and_get_description from ...jobs import ( JobPost, Compensation, @@ -200,9 +200,7 @@ class GlassdoorScraper(Scraper): data = response.json()[0] desc = data['data']['jobview']['job']['description'] soup = BeautifulSoup(desc, 'html.parser') - description = soup.get_text(separator='\n') - - return description + return modify_and_get_description(soup) @staticmethod def parse_compensation(data: dict) -> Optional[Compensation]: @@ -292,12 +290,11 @@ class GlassdoorScraper(Scraper): for job_type in JobType: if job_type_str in job_type.value: return [job_type] - return None @staticmethod - def parse_location(location_name: str) -> Location: + def parse_location(location_name: str) -> Location | None: if not location_name or location_name == "Remote": - return None + return city, _, state = location_name.partition(", ") return Location(city=city, state=state) @@ -306,7 +303,6 @@ class GlassdoorScraper(Scraper): for cursor_data in pagination_cursors: if cursor_data["pageNumber"] == page_num: return cursor_data["cursor"] - return None @staticmethod def headers() -> dict: diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index ef7d3f2..eeb7ff8 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -21,6 +21,7 @@ from ..utils import ( extract_emails_from_text, create_session, get_enum_from_job_type, + modify_and_get_description ) from ...jobs import ( JobPost, @@ -247,9 +248,7 @@ class IndeedScraper(Scraper): return None soup = BeautifulSoup(job_description, "html.parser") - text_content = "\n".join(soup.stripped_strings) - - return text_content + return modify_and_get_description(soup) @staticmethod def get_job_type(job: dict) -> list[JobType] | None: diff --git a/src/jobspy/scrapers/linkedin/__init__.py 
def modify_and_get_description(soup) -> str:
    """
    Convert a parsed HTML job description into plain text with bulleted lists.

    Every non-empty ``<li>`` element has its content replaced by a single
    string of the form ``"- <item text>"``. The text of the whole tree is then
    extracted with one newline between text nodes, each resulting line is
    stripped, and blank lines are dropped.

    :param soup: parsed HTML (the scrapers pass a ``BeautifulSoup`` document
        or a tag taken from one). It is modified in place: the children of
        each non-empty ``<li>`` are replaced by the bullet string.
    :return: the description as newline-separated, stripped, non-empty lines;
        an empty string when the tree contains no text.
    """
    for li in soup.find_all('li'):
        item_text = li.get_text().strip()
        # Skip empty items so they do not leave a dangling "-" line behind.
        if item_text:
            # Assigning .string swaps the item's children for one text node,
            # which keeps the bullet marker and the item text on the same
            # line when the tree is later joined with newlines.
            li.string = "- " + item_text

    raw_text = soup.get_text(separator='\n')
    # Normalise line by line instead of only collapsing runs of "\n":
    # whitespace-only lines and stray padding around inline tags would
    # otherwise survive in the output.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return '\n'.join(line for line in stripped_lines if line)