fix: correct LinkedIn logger naming (#291)

* fix: correct LinkedIn logger naming

* add: plain-text description format for LinkedIn
This commit is contained in:
Lixian Wang
2025-08-23 22:37:49 +02:00
committed by GitHub
parent 84ed670df3
commit b6d5cd8d79
4 changed files with 14 additions and 2 deletions

View File

@@ -107,6 +107,7 @@ def scrape_jobs(
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
# Fall back to site_name (not cap_name) so the "ZipRecruiter" rename made on
# the previous line is preserved for non-LinkedIn sites.
site_name = "LinkedIn" if cap_name == "Linkedin" else site_name
create_logger(site_name).info(f"finished scraping") create_logger(site_name).info(f"finished scraping")
return site.value, scraped_data return site.value, scraped_data

View File

@@ -35,6 +35,7 @@ from jobspy.util import (
extract_emails_from_text, extract_emails_from_text,
currency_parser, currency_parser,
markdown_converter, markdown_converter,
plain_converter,
create_session, create_session,
remove_attributes, remove_attributes,
create_logger, create_logger,
@@ -267,7 +268,8 @@ class LinkedIn(Scraper):
description = div_content.prettify(formatter="html") description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description) description = markdown_converter(description)
elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
description = plain_converter(description)
h3_tag = soup.find( h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip() "h3", text=lambda text: text and "Job function" in text.strip()
) )

View File

@@ -234,7 +234,7 @@ class Compensation(BaseModel):
class DescriptionFormat(Enum): class DescriptionFormat(Enum):
MARKDOWN = "markdown" MARKDOWN = "markdown"
HTML = "html" HTML = "html"
PLAIN = "plain"
class JobPost(BaseModel): class JobPost(BaseModel):
id: str | None = None id: str | None = None

View File

@@ -157,6 +157,15 @@ def markdown_converter(description_html: str):
markdown = md(description_html) markdown = md(description_html)
return markdown.strip() return markdown.strip()
def plain_converter(decription_html:str):
from bs4 import BeautifulSoup
if decription_html is None:
return None
soup = BeautifulSoup(decription_html, "html.parser")
text = soup.get_text(separator=" ")
text = re.sub(r'\s+',' ',text)
return text.strip()
def extract_emails_from_text(text: str) -> list[str] | None: def extract_emails_from_text(text: str) -> list[str] | None:
if not text: if not text: