diff --git a/jobspy/__init__.py b/jobspy/__init__.py
index 6b34c3e..76129e8 100644
--- a/jobspy/__init__.py
+++ b/jobspy/__init__.py
@@ -107,6 +107,8 @@ def scrape_jobs(
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
+        # Fall back to site_name (not cap_name) so the ZipRecruiter fix-up above survives.
+        site_name = "LinkedIn" if cap_name == "Linkedin" else site_name
         create_logger(site_name).info(f"finished scraping")
         return site.value, scraped_data
 
diff --git a/jobspy/linkedin/__init__.py b/jobspy/linkedin/__init__.py
index a164732..5f7e1b4 100644
--- a/jobspy/linkedin/__init__.py
+++ b/jobspy/linkedin/__init__.py
@@ -35,6 +35,7 @@ from jobspy.util import (
     extract_emails_from_text,
     currency_parser,
     markdown_converter,
+    plain_converter,
     create_session,
     remove_attributes,
     create_logger,
@@ -267,7 +268,9 @@ class LinkedIn(Scraper):
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                 description = markdown_converter(description)
+            elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
+                description = plain_converter(description)
 
         h3_tag = soup.find(
             "h3", text=lambda text: text and "Job function" in text.strip()
         )
diff --git a/jobspy/model.py b/jobspy/model.py
index 3fb34d4..9dba826 100644
--- a/jobspy/model.py
+++ b/jobspy/model.py
@@ -234,7 +234,8 @@ class Compensation(BaseModel):
 class DescriptionFormat(Enum):
     MARKDOWN = "markdown"
     HTML = "html"
+    PLAIN = "plain"
 
 
 class JobPost(BaseModel):
     id: str | None = None
diff --git a/jobspy/util.py b/jobspy/util.py
index da0e372..ae17f65 100644
--- a/jobspy/util.py
+++ b/jobspy/util.py
@@ -157,6 +157,28 @@ def markdown_converter(description_html: str):
     markdown = md(description_html)
     return markdown.strip()
 
+
+def plain_converter(description_html: str | None) -> str | None:
+    """Convert an HTML job description to whitespace-normalised plain text.
+
+    Args:
+        description_html: HTML markup of the description, or None.
+
+    Returns:
+        The text content with all tags removed and every run of whitespace
+        collapsed to a single space, or None when the input is None.
+    """
+    if description_html is None:
+        return None
+
+    # Function-scope import: bs4 is only needed by this converter.
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(description_html, "html.parser")
+    # str.split() with no arguments splits on any whitespace run and drops
+    # leading/trailing whitespace, so join() both collapses and strips.
+    return " ".join(soup.get_text(separator=" ").split())
+
 
 def extract_emails_from_text(text: str) -> list[str] | None:
     if not text: