fix: clean description (#88)

pull/91/head^2 v1.1.36
Cullen Watson 2024-01-28 21:50:41 -06:00 committed by GitHub
parent 5b3627b244
commit b97c73ffd6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 41 additions and 25 deletions

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.35"
version = "1.1.36"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@ -1,7 +1,7 @@
from typing import Union, Optional
from typing import Optional
from datetime import date
from enum import Enum
from pydantic import BaseModel, validator
from pydantic import BaseModel
class JobType(Enum):

View File

@ -14,7 +14,7 @@ from ..utils import count_urgent_words, extract_emails_from_text
from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException
from ..utils import create_session
from ..utils import create_session, modify_and_get_description
from ...jobs import (
JobPost,
Compensation,
@ -200,9 +200,7 @@ class GlassdoorScraper(Scraper):
data = response.json()[0]
desc = data['data']['jobview']['job']['description']
soup = BeautifulSoup(desc, 'html.parser')
description = soup.get_text(separator='\n')
return description
return modify_and_get_description(soup)
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
@ -292,12 +290,11 @@ class GlassdoorScraper(Scraper):
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
@staticmethod
def parse_location(location_name: str) -> Location:
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return None
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@ -306,7 +303,6 @@ class GlassdoorScraper(Scraper):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]
return None
@staticmethod
def headers() -> dict:

View File

@ -21,6 +21,7 @@ from ..utils import (
extract_emails_from_text,
create_session,
get_enum_from_job_type,
modify_and_get_description
)
from ...jobs import (
JobPost,
@ -247,9 +248,7 @@ class IndeedScraper(Scraper):
return None
soup = BeautifulSoup(job_description, "html.parser")
text_content = "\n".join(soup.stripped_strings)
return text_content
return modify_and_get_description(soup)
@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:

View File

@ -4,23 +4,36 @@ jobspy.scrapers.linkedin
This module contains routines to scrape LinkedIn.
"""
import time
import random
from typing import Optional
from datetime import datetime
import requests
import time
from requests.exceptions import ProxyError
from bs4 import BeautifulSoup
from bs4.element import Tag
from threading import Lock
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException
from ..utils import create_session
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation
)
from ..utils import (
count_urgent_words,
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
modify_and_get_description
)
class LinkedInScraper(Scraper):
@ -213,7 +226,7 @@ class LinkedInScraper(Scraper):
description = None
if div_content:
description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())
description = modify_and_get_description(div_content)
def get_job_type(
soup_job_type: BeautifulSoup,

View File

@ -8,6 +8,15 @@ from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType
def modify_and_get_description(soup):
    """
    Turn a parsed HTML job description into plain text with bulleted list items.

    Every ``<li>`` element found in ``soup`` has its ``.string`` reassigned to
    its own text prefixed with ``"- "``; the text of the whole tree is then
    extracted with newlines between fragments, stripped, and runs of
    consecutive newlines are collapsed to a single newline.

    Note: ``soup`` is modified in place by the ``.string`` assignment.

    :param soup: parsed HTML exposing ``find_all`` and ``get_text``
        (callers pass a BeautifulSoup object or tag)
    :return: the cleaned plain-text description
    """
    for list_item in soup.find_all('li'):
        bullet_text = "- " + list_item.get_text()
        list_item.string = bullet_text

    raw_text = soup.get_text(separator='\n')
    trimmed_text = raw_text.strip()
    # Collapse any run of consecutive newlines into one.
    return re.sub(r'\n+', '\n', trimmed_text)
def count_urgent_words(description: str) -> int:
"""
Count the number of urgent words or phrases in a job description.

View File

@ -10,14 +10,13 @@ import re
from datetime import datetime, date
from typing import Optional, Tuple, Any
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
class ZipRecruiterScraper(Scraper):
@ -107,9 +106,9 @@ class ZipRecruiterScraper(Scraper):
title = job.get("name")
job_url = job.get("job_url")
description = BeautifulSoup(
job.get("job_description", "").strip(), "html.parser"
).get_text(separator="\n")
job_description_html = job.get("job_description", "").strip()
description_soup = BeautifulSoup(job_description_html, "html.parser")
description = modify_and_get_description(description_soup)
company = job["hiring_company"].get("name") if "hiring_company" in job else None
country_value = "usa" if job.get("job_country") == "US" else "canada"