mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 11:34:47 -08:00
fix(naukri): prevent str.find error by normalizing input and parsing before Markdown (#300)
This commit is contained in:
@@ -164,12 +164,15 @@ class Naukri(Scraper):
|
|||||||
date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
|
date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
|
||||||
|
|
||||||
job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
|
job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
|
||||||
description = job.get("jobDescription") if full_descr else None
|
raw_description = job.get("jobDescription") if full_descr else None
|
||||||
|
|
||||||
|
job_type = parse_job_type(raw_description) if raw_description else None
|
||||||
|
company_industry = parse_company_industry(raw_description) if raw_description else None
|
||||||
|
|
||||||
|
description = raw_description
|
||||||
if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
|
||||||
job_type = parse_job_type(description) if description else None
|
|
||||||
company_industry = parse_company_industry(description) if description else None
|
|
||||||
is_remote = is_job_remote(title, description or "", location)
|
is_remote = is_job_remote(title, description or "", location)
|
||||||
company_logo = job.get("logoPathV3") or job.get("logoPath")
|
company_logo = job.get("logoPathV3") or job.get("logoPath")
|
||||||
|
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ from jobspy.model import JobType, Location
|
|||||||
from jobspy.util import get_enum_from_job_type
|
from jobspy.util import get_enum_from_job_type
|
||||||
|
|
||||||
|
|
||||||
def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
|
def parse_job_type(soup: BeautifulSoup |str) -> list[JobType] | None:
|
||||||
"""
|
"""
|
||||||
Gets the job type from the job page
|
Gets the job type from the job page
|
||||||
"""
|
"""
|
||||||
|
if isinstance(soup, str):
|
||||||
|
soup = BeautifulSoup(soup, "html.parser")
|
||||||
job_type_tag = soup.find("span", class_="job-type")
|
job_type_tag = soup.find("span", class_="job-type")
|
||||||
if job_type_tag:
|
if job_type_tag:
|
||||||
job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
|
job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
|
||||||
@@ -16,10 +18,12 @@ def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_company_industry(soup: BeautifulSoup) -> str | None:
|
def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
|
||||||
"""
|
"""
|
||||||
Gets the company industry from the job page
|
Gets the company industry from the job page
|
||||||
"""
|
"""
|
||||||
|
if isinstance(soup, str):
|
||||||
|
soup = BeautifulSoup(soup, "html.parser")
|
||||||
industry_tag = soup.find("span", class_="industry")
|
industry_tag = soup.find("span", class_="industry")
|
||||||
return industry_tag.get_text(strip=True) if industry_tag else None
|
return industry_tag.get_text(strip=True) if industry_tag else None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user