mirror of https://github.com/Bunsly/JobSpy
parent
5b3627b244
commit
b97c73ffd6
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.35"
|
||||
version = "1.1.36"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||
homepage = "https://github.com/Bunsly/JobSpy"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import Union, Optional
|
||||
from typing import Optional
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, validator
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class JobType(Enum):
|
||||
|
|
|
@ -14,7 +14,7 @@ from ..utils import count_urgent_words, extract_emails_from_text
|
|||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import GlassdoorException
|
||||
from ..utils import create_session
|
||||
from ..utils import create_session, modify_and_get_description
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
|
@ -200,9 +200,7 @@ class GlassdoorScraper(Scraper):
|
|||
data = response.json()[0]
|
||||
desc = data['data']['jobview']['job']['description']
|
||||
soup = BeautifulSoup(desc, 'html.parser')
|
||||
description = soup.get_text(separator='\n')
|
||||
|
||||
return description
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def parse_compensation(data: dict) -> Optional[Compensation]:
|
||||
|
@ -292,12 +290,11 @@ class GlassdoorScraper(Scraper):
|
|||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def parse_location(location_name: str) -> Location:
|
||||
def parse_location(location_name: str) -> Location | None:
|
||||
if not location_name or location_name == "Remote":
|
||||
return None
|
||||
return
|
||||
city, _, state = location_name.partition(", ")
|
||||
return Location(city=city, state=state)
|
||||
|
||||
|
@ -306,7 +303,6 @@ class GlassdoorScraper(Scraper):
|
|||
for cursor_data in pagination_cursors:
|
||||
if cursor_data["pageNumber"] == page_num:
|
||||
return cursor_data["cursor"]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def headers() -> dict:
|
||||
|
|
|
@ -21,6 +21,7 @@ from ..utils import (
|
|||
extract_emails_from_text,
|
||||
create_session,
|
||||
get_enum_from_job_type,
|
||||
modify_and_get_description
|
||||
)
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
|
@ -247,9 +248,7 @@ class IndeedScraper(Scraper):
|
|||
return None
|
||||
|
||||
soup = BeautifulSoup(job_description, "html.parser")
|
||||
text_content = "\n".join(soup.stripped_strings)
|
||||
|
||||
return text_content
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def get_job_type(job: dict) -> list[JobType] | None:
|
||||
|
|
|
@ -4,23 +4,36 @@ jobspy.scrapers.linkedin
|
|||
|
||||
This module contains routines to scrape LinkedIn.
|
||||
"""
|
||||
import time
|
||||
import random
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
import time
|
||||
from requests.exceptions import ProxyError
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from threading import Lock
|
||||
from bs4.element import Tag
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import LinkedInException
|
||||
from ..utils import create_session
|
||||
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Country,
|
||||
Compensation
|
||||
)
|
||||
from ..utils import (
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
currency_parser,
|
||||
modify_and_get_description
|
||||
)
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
|
@ -213,7 +226,7 @@ class LinkedInScraper(Scraper):
|
|||
|
||||
description = None
|
||||
if div_content:
|
||||
description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())
|
||||
description = modify_and_get_description(div_content)
|
||||
|
||||
def get_job_type(
|
||||
soup_job_type: BeautifulSoup,
|
||||
|
|
|
@ -8,6 +8,15 @@ from requests.adapters import HTTPAdapter, Retry
|
|||
from ..jobs import JobType
|
||||
|
||||
|
||||
def modify_and_get_description(soup) -> str:
    """
    Flatten a parsed HTML job description into plain text.

    Each ``<li>`` element is rewritten as "- <item text>" so list entries stay
    recognizable once the markup is gone. The text is then extracted with one
    text node per line; every line is trimmed and empty lines are dropped.

    :param soup: parsed HTML fragment exposing ``find_all`` and ``get_text``
        (a BeautifulSoup document or Tag). NOTE: it is modified in place.
    :return: the description as newline-separated, non-empty, trimmed lines
    """
    for li in soup.find_all('li'):
        # Assigning .string swaps the item's children for a single text node,
        # so the "- " marker and the item text end up on the same output line.
        li.string = "- " + li.get_text()

    text = soup.get_text(separator='\n')

    # Whitespace-only text nodes between tags come out as blank or padded
    # lines; trim each line and keep only those with content, which is what
    # the per-site description code did before this helper replaced it.
    stripped_lines = (line.strip() for line in text.split('\n'))
    return '\n'.join(line for line in stripped_lines if line)
|
||||
|
||||
|
||||
def count_urgent_words(description: str) -> int:
|
||||
"""
|
||||
Count the number of urgent words or phrases in a job description.
|
||||
|
|
|
@ -10,14 +10,13 @@ import re
|
|||
from datetime import datetime, date
|
||||
from typing import Optional, Tuple, Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import ZipRecruiterException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
||||
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
|
@ -107,9 +106,9 @@ class ZipRecruiterScraper(Scraper):
|
|||
title = job.get("name")
|
||||
job_url = job.get("job_url")
|
||||
|
||||
description = BeautifulSoup(
|
||||
job.get("job_description", "").strip(), "html.parser"
|
||||
).get_text(separator="\n")
|
||||
job_description_html = job.get("job_description", "").strip()
|
||||
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
||||
description = modify_and_get_description(description_soup)
|
||||
|
||||
company = job["hiring_company"].get("name") if "hiring_company" in job else None
|
||||
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
||||
|
|
Loading…
Reference in New Issue