From 6e8576f8a8f5e28762399ac1236d81f260fa1331 Mon Sep 17 00:00:00 2001 From: Kaushik H S Date: Sun, 24 Aug 2025 02:08:26 +0530 Subject: [PATCH] fix(naukri): prevent str.find error by normalizing input and parsing before Markdown (#300) --- jobspy/naukri/__init__.py | 9 ++++++--- jobspy/naukri/util.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/jobspy/naukri/__init__.py b/jobspy/naukri/__init__.py index d456c85..43d4828 100644 --- a/jobspy/naukri/__init__.py +++ b/jobspy/naukri/__init__.py @@ -164,12 +164,15 @@ class Naukri(Scraper): date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate")) job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}" - description = job.get("jobDescription") if full_descr else None + raw_description = job.get("jobDescription") if full_descr else None + + job_type = parse_job_type(raw_description) if raw_description else None + company_industry = parse_company_industry(raw_description) if raw_description else None + + description = raw_description if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN: description = markdown_converter(description) - job_type = parse_job_type(description) if description else None - company_industry = parse_company_industry(description) if description else None is_remote = is_job_remote(title, description or "", location) company_logo = job.get("logoPathV3") or job.get("logoPath") diff --git a/jobspy/naukri/util.py b/jobspy/naukri/util.py index f363c9a..c1b0d7a 100644 --- a/jobspy/naukri/util.py +++ b/jobspy/naukri/util.py @@ -5,10 +5,12 @@ from jobspy.model import JobType, Location from jobspy.util import get_enum_from_job_type -def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None: +def parse_job_type(soup: BeautifulSoup |str) -> list[JobType] | None: """ Gets the job type from the job page """ + if isinstance(soup, str): + soup = BeautifulSoup(soup, "html.parser") job_type_tag = soup.find("span", class_="job-type") if job_type_tag: job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "") @@ -16,10 +18,12 @@ def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None: return None -def parse_company_industry(soup: BeautifulSoup) -> str | None: +def parse_company_industry(soup: BeautifulSoup | str) -> str | None: """ Gets the company industry from the job page """ + if isinstance(soup, str): + soup = BeautifulSoup(soup, "html.parser") industry_tag = soup.find("span", class_="industry") return industry_tag.get_text(strip=True) if industry_tag else None