mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 11:34:47 -08:00
fix:correct LinkedIn logger naming (#291)
* fix:correct LinkedIn logger naming * add:linkedin description plain format
This commit is contained in:
@@ -107,6 +107,7 @@ def scrape_jobs(
|
|||||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||||
cap_name = site.value.capitalize()
|
cap_name = site.value.capitalize()
|
||||||
# Map the capitalized enum value to the site's display name used for the logger.
# One lookup handles every special case; two chained conditional assignments
# would let the second one reset "ZipRecruiter" back to "Zip_recruiter".
display_names = {"Zip_recruiter": "ZipRecruiter", "Linkedin": "LinkedIn"}
site_name = display_names.get(cap_name, cap_name)
|
||||||
create_logger(site_name).info(f"finished scraping")
|
create_logger(site_name).info(f"finished scraping")
|
||||||
return site.value, scraped_data
|
return site.value, scraped_data
|
||||||
|
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ from jobspy.util import (
|
|||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
currency_parser,
|
currency_parser,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
|
plain_converter,
|
||||||
create_session,
|
create_session,
|
||||||
remove_attributes,
|
remove_attributes,
|
||||||
create_logger,
|
create_logger,
|
||||||
@@ -267,7 +268,8 @@ class LinkedIn(Scraper):
|
|||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
|
||||||
|
description = plain_converter(description)
|
||||||
h3_tag = soup.find(
|
h3_tag = soup.find(
|
||||||
"h3", text=lambda text: text and "Job function" in text.strip()
|
"h3", text=lambda text: text and "Job function" in text.strip()
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -234,7 +234,7 @@ class Compensation(BaseModel):
|
|||||||
class DescriptionFormat(Enum):
    """Formats in which a scraped job description can be returned."""

    # Description converted to Markdown.
    MARKDOWN = "markdown"
    # Description kept as HTML markup.
    HTML = "html"
    # Description reduced to plain text.
    PLAIN = "plain"
|
||||||
|
|
||||||
class JobPost(BaseModel):
|
class JobPost(BaseModel):
|
||||||
id: str | None = None
|
id: str | None = None
|
||||||
|
|||||||
@@ -157,6 +157,15 @@ def markdown_converter(description_html: str):
|
|||||||
markdown = md(description_html)
|
markdown = md(description_html)
|
||||||
return markdown.strip()
|
return markdown.strip()
|
||||||
|
|
||||||
|
def plain_converter(decription_html:str):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
if decription_html is None:
|
||||||
|
return None
|
||||||
|
soup = BeautifulSoup(decription_html, "html.parser")
|
||||||
|
text = soup.get_text(separator=" ")
|
||||||
|
text = re.sub(r'\s+',' ',text)
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_emails_from_text(text: str) -> list[str] | None:
|
def extract_emails_from_text(text: str) -> list[str] | None:
|
||||||
if not text:
|
if not text:
|
||||||
|
|||||||
Reference in New Issue
Block a user