mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
fix:correct LinkedIn logger naming (#291)
* fix:correct LinkedIn logger naming * add:linkedin description plain format
This commit is contained in:
@@ -107,6 +107,7 @@ def scrape_jobs(
|
||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||
cap_name = site.value.capitalize()
|
||||
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
||||
site_name = "LinkedIn" if cap_name == "Linkedin" else cap_name
|
||||
create_logger(site_name).info(f"finished scraping")
|
||||
return site.value, scraped_data
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@ from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
currency_parser,
|
||||
markdown_converter,
|
||||
plain_converter,
|
||||
create_session,
|
||||
remove_attributes,
|
||||
create_logger,
|
||||
@@ -267,7 +268,8 @@ class LinkedIn(Scraper):
|
||||
description = div_content.prettify(formatter="html")
|
||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
description = markdown_converter(description)
|
||||
|
||||
elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
|
||||
description = plain_converter(description)
|
||||
h3_tag = soup.find(
|
||||
"h3", text=lambda text: text and "Job function" in text.strip()
|
||||
)
|
||||
|
||||
@@ -234,7 +234,7 @@ class Compensation(BaseModel):
|
||||
class DescriptionFormat(Enum):
    """Output formats a scraped job description can be rendered in."""

    # Description is converted from HTML to Markdown.
    MARKDOWN = "markdown"
    # Description is kept as HTML.
    HTML = "html"
    # Description is reduced to plain text with the markup removed.
    PLAIN = "plain"
|
||||
|
||||
class JobPost(BaseModel):
|
||||
id: str | None = None
|
||||
|
||||
@@ -157,6 +157,15 @@ def markdown_converter(description_html: str):
|
||||
markdown = md(description_html)
|
||||
return markdown.strip()
|
||||
|
||||
def plain_converter(decription_html:str):
|
||||
from bs4 import BeautifulSoup
|
||||
if decription_html is None:
|
||||
return None
|
||||
soup = BeautifulSoup(decription_html, "html.parser")
|
||||
text = soup.get_text(separator=" ")
|
||||
text = re.sub(r'\s+',' ',text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def extract_emails_from_text(text: str) -> list[str] | None:
|
||||
if not text:
|
||||
|
||||
Reference in New Issue
Block a user