feat: add naukri.com support (#259)

2025-03-22 03:53:07 +05:30 · 2025-03-22 03:53:07 +05:30 · 0946cb3373
parent 051981689f
commit 0946cb3373
8 changed files with 395 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@

 ## Features

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking

@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs

 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
    search_term="software engineer",
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    location="San Francisco, CA",
@ -51,6 +51,7 @@ linkedin       Software Engineer - Early Career  Lockheed Martin   Sunnyvale
 linkedin       Full-Stack Software Engineer      Rain              New York      NY     fulltime  yearly    None        None        https://www.linkedin.com/jobs/view/3696158877      Rain’s mission is to create the fastest and ea...
 zip_recruiter Software Engineer - New Grad       ZipRecruiter      Santa Monica  CA     fulltime  yearly    130000      150000      https://www.ziprecruiter.com/jobs/ziprecruiter...  We offer a hybrid work environment. Most US-ba...
 zip_recruiter Software Developer                 TEKsystems        Phoenix       AZ     fulltime  hourly    65          75          https://www.ziprecruiter.com/jobs/teksystems-0...  Top Skills' Details• 6 years of Java developme...
+
 ```

 ### Parameters for `scrape_jobs()`
@ -245,4 +246,12 @@ Indeed specific
 ├── company_revenue_label
 ├── company_description
 └── company_logo
+
+Naukri specific
+├── skills
+├── experience_range
+├── company_rating
+├── company_reviews_count
+├── vacancy_count
+└── work_from_home_type
 ```
--- a/jobspy/init.py
+++ b/jobspy/init.py
@ -10,6 +10,7 @@ from jobspy.glassdoor import Glassdoor
 from jobspy.google import Google
 from jobspy.indeed import Indeed
 from jobspy.linkedin import LinkedIn
+from jobspy.naukri import Naukri
 from jobspy.model import JobType, Location, JobResponse, Country
 from jobspy.model import SalarySource, ScraperInput, Site
 from jobspy.util import (
@ -57,6 +58,7 @@ def scrape_jobs(
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
+        Site.NAUKRI: Naukri,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None
@ -139,6 +141,7 @@ def scrape_jobs(
                    **job_data["location"]
                ).display_location()

+            # Handle compensation
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (
@ -157,7 +160,6 @@ def scrape_jobs(
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
-
            else:
                if country_enum == Country.USA:
                    (
@ -176,6 +178,17 @@ def scrape_jobs(
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )
+
+            #naukri-specific fields
+            job_data["skills"] = (
+                ", ".join(job_data["skills"]) if job_data["skills"] else None
+            )
+            job_data["experience_range"] = job_data.get("experience_range")
+            job_data["company_rating"] = job_data.get("company_rating")
+            job_data["company_reviews_count"] = job_data.get("company_reviews_count")
+            job_data["vacancy_count"] = job_data.get("vacancy_count")
+            job_data["work_from_home_type"] = job_data.get("work_from_home_type")
+
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)

@ -199,4 +212,4 @@ def scrape_jobs(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
-        return pd.DataFrame()
+        return pd.DataFrame()
--- a/jobspy/exception.py
+++ b/jobspy/exception.py
@ -34,3 +34,7 @@ class GoogleJobsException(Exception):
 class BaytException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Bayt")
+
+class NaukriException(Exception):
+    """Error raised for failures in the Naukri scraper."""
+
+    def __init__(self, message=None):
+        # Fall back to the generic text when no (or an empty) message is given.
+        if not message:
+            message = "An error occurred with Naukri"
+        super().__init__(message)
--- a/jobspy/model.py
+++ b/jobspy/model.py
@ -254,13 +254,13 @@ class JobPost(BaseModel):
    is_remote: bool | None = None
    listing_type: str | None = None

-    # linkedin specific
+    # LinkedIn specific
    job_level: str | None = None

-    # linkedin and indeed specific
+    # LinkedIn and Indeed specific
    company_industry: str | None = None

-    # indeed specific
+    # Indeed specific
    company_addresses: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
@ -268,9 +268,16 @@ class JobPost(BaseModel):
    company_logo: str | None = None
    banner_photo_url: str | None = None

-    # linkedin only atm
+    # LinkedIn only atm
    job_function: str | None = None

+    # Naukri specific
+    skills: list[str] | None = None  #from tagsAndSkills
+    experience_range: str | None = None  #from experienceText
+    company_rating: float | None = None  #from ambitionBoxData.AggregateRating
+    company_reviews_count: int | None = None  #from ambitionBoxData.ReviewsCount
+    vacancy_count: int | None = None  #from vacancy
+    work_from_home_type: str | None = None  #from clusters.wfhType (e.g., "Hybrid", "Remote")

 class JobResponse(BaseModel):
    jobs: list[JobPost] = []
@ -283,6 +290,7 @@ class Site(Enum):
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"
+    NAUKRI = "naukri"


 class SalarySource(Enum):
--- a/jobspy/naukri/init.py
+++ b/jobspy/naukri/init.py
@ -0,0 +1,301 @@
+from __future__ import annotations
+
+import math
+import random
+import time
+from datetime import datetime, date
+from typing import Optional
+
+import regex as re
+import requests
+
+from jobspy.exception import NaukriException
+from jobspy.naukri.constant import headers as naukri_headers
+from jobspy.naukri.util import (
+    is_job_remote,
+    parse_job_type,
+    parse_company_industry,
+)
+from jobspy.model import (
+    JobPost,
+    Location,
+    JobResponse,
+    Country,
+    Compensation,
+    DescriptionFormat,
+    Scraper,
+    ScraperInput,
+    Site,
+)
+from jobspy.util import (
+    extract_emails_from_text,
+    currency_parser,
+    markdown_converter,
+    create_session,
+    create_logger,
+)
+
+log = create_logger("Naukri")
+
+class Naukri(Scraper):
+    """
+    Scraper for the Naukri job search API.
+
+    Pages through ``base_url`` and turns every entry of the response's
+    ``jobDetails`` list into a ``JobPost``.
+    """
+
+    base_url = "https://www.naukri.com/jobapi/v3/search"
+    delay = 3  # minimum pause between page requests, in seconds
+    band_delay = 4  # width of the random jitter added on top of ``delay``
+    jobs_per_page = 20
+    max_pages = 50  # arbitrary upper bound on the page number requested
+
+    def __init__(
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+    ):
+        """
+        Initializes the Naukri scraper with a retrying session and the Naukri headers
+        """
+        super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert)
+        self.session = create_session(
+            proxies=self.proxies,
+            ca_cert=ca_cert,
+            is_tls=False,
+            has_retry=True,
+            delay=5,
+            clear_cookies=True,
+        )
+        self.session.headers.update(naukri_headers)
+        self.scraper_input = None
+        self.country = "India"  #naukri is india-focused by default
+        log.info("Naukri scraper initialized")
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes Naukri API for jobs with scraper_input criteria
+
+        A failed request or a non-2xx/3xx status stops the scrape and returns
+        the jobs collected so far.
+
+        :param scraper_input: search criteria (term, location, offset, ...)
+        :return: job_response with at most ``results_wanted`` jobs
+        :raises NaukriException: if a job entry cannot be turned into a JobPost
+        """
+        self.scraper_input = scraper_input
+        job_list: list[JobPost] = []
+        seen_ids = set()
+        start = scraper_input.offset or 0
+        page = (start // self.jobs_per_page) + 1
+        request_count = 0
+        results_wanted = scraper_input.results_wanted
+        total_pages = math.ceil(results_wanted / self.jobs_per_page)
+        search_term = scraper_input.search_term
+        # NOTE(review): reuses the LinkedIn flag to decide whether descriptions are kept
+        fetch_desc = scraper_input.linkedin_fetch_description
+
+        def continue_search() -> bool:
+            return len(job_list) < results_wanted and page <= self.max_pages
+
+        while continue_search():
+            request_count += 1
+            log.info(
+                f"Scraping page {request_count} / {total_pages} "
+                f"for search term: {search_term}"
+            )
+            params = {
+                "noOfResults": self.jobs_per_page,
+                "urlType": "search_by_keyword",
+                "searchType": "adv",
+                "keyword": search_term,
+                "pageNo": page,
+                "k": search_term,
+                # Without a search term there is nothing to build the key from;
+                # None entries are dropped below.
+                "seoKey": (
+                    f"{search_term.lower().replace(' ', '-')}-jobs"
+                    if search_term is not None
+                    else None
+                ),
+                "src": "jobsearchDesk",
+                "latLong": "",
+                "location": scraper_input.location,
+                "remote": "true" if scraper_input.is_remote else None,
+            }
+            if scraper_input.hours_old:
+                # Round up so a window shorter than a day does not collapse to 0
+                # and a partial day is not cut off.
+                params["days"] = math.ceil(scraper_input.hours_old / 24)
+
+            params = {k: v for k, v in params.items() if v is not None}
+            try:
+                log.debug(f"Sending request to {self.base_url} with params: {params}")
+                response = self.session.get(self.base_url, params=params, timeout=10)
+                if response.status_code not in range(200, 400):
+                    err = f"Naukri API response status code {response.status_code} - {response.text}"
+                    log.error(err)
+                    return JobResponse(jobs=job_list)
+                job_details = response.json().get("jobDetails") or []
+            except Exception as e:
+                # Best effort: keep whatever was collected before the failure.
+                log.error(f"Naukri API request failed: {str(e)}")
+                return JobResponse(jobs=job_list)
+
+            log.info(f"Received {len(job_details)} job entries from API")
+            if not job_details:
+                log.warning("No job details found in API response")
+                break
+
+            for job in job_details:
+                job_id = job.get("jobId")
+                if not job_id or job_id in seen_ids:
+                    continue
+                seen_ids.add(job_id)
+                log.debug(f"Processing job ID: {job_id}")
+
+                try:
+                    job_post = self._process_job(job, job_id, fetch_desc)
+                except Exception as e:
+                    log.error(f"Error processing job ID {job_id}: {str(e)}")
+                    raise NaukriException(str(e)) from e
+
+                if job_post:
+                    job_list.append(job_post)
+                    log.info(f"Added job: {job_post.title} (ID: {job_id})")
+                if not continue_search():
+                    break
+
+            if continue_search():
+                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+                page += 1
+
+        job_list = job_list[:results_wanted]
+        log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
+        return JobResponse(jobs=job_list)
+
+    def _process_job(
+        self, job: dict, job_id: str, full_descr: bool
+    ) -> Optional[JobPost]:
+        """
+        Processes a single job from API response into a JobPost object
+
+        :param job: one entry of the API's ``jobDetails`` list
+        :param job_id: the entry's ``jobId``
+        :param full_descr: keep ``jobDescription`` when true, otherwise drop it
+        :return: the populated JobPost
+        """
+        # ``or`` also covers keys that are present but null.
+        title = job.get("title") or "N/A"
+        company = job.get("companyName", "N/A")
+        static_url = job.get("staticUrl")
+        company_url = f"https://www.naukri.com/{static_url}" if static_url else None
+
+        placeholders = job.get("placeholders") or []
+        location = self._get_location(placeholders)
+        compensation = self._get_compensation(placeholders)
+        date_posted = self._parse_date(
+            job.get("footerPlaceholderLabel"), job.get("createdDate")
+        )
+
+        job_path = job.get("jdURL") or f"/job/{job_id}"
+        job_url = f"https://www.naukri.com{job_path}"
+        description = job.get("jobDescription") if full_descr else None
+        if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+            description = markdown_converter(description)
+
+        # NOTE(review): both helpers are annotated as taking a BeautifulSoup
+        # object but receive the description string here - confirm they accept str.
+        job_type = parse_job_type(description) if description else None
+        company_industry = parse_company_industry(description) if description else None
+        is_remote = is_job_remote(title, description or "", location)
+        company_logo = job.get("logoPathV3") or job.get("logoPath")
+
+        # Naukri-specific fields
+        tags = job.get("tagsAndSkills")
+        skills = None
+        if tags:
+            # Trim each entry so "a, b" does not yield " b"; drop empty entries.
+            skills = [tag.strip() for tag in tags.split(",") if tag.strip()] or None
+        experience_range = job.get("experienceText")
+        ambition_box = job.get("ambitionBoxData") or {}
+        company_rating = None
+        rating = ambition_box.get("AggregateRating")
+        if rating:
+            try:
+                company_rating = float(rating)
+            except (TypeError, ValueError):
+                # A malformed rating should not abort the whole scrape.
+                log.debug(f"Could not parse company rating: {rating}")
+        company_reviews_count = ambition_box.get("ReviewsCount")
+        vacancy_count = job.get("vacancy")
+        work_from_home_type = self._infer_work_from_home_type(
+            placeholders, title, description or ""
+        )
+
+        job_post = JobPost(
+            id=f"nk-{job_id}",
+            title=title,
+            company_name=company,
+            company_url=company_url,
+            location=location,
+            is_remote=is_remote,
+            date_posted=date_posted,
+            job_url=job_url,
+            compensation=compensation,
+            job_type=job_type,
+            company_industry=company_industry,
+            description=description,
+            emails=extract_emails_from_text(description or ""),
+            company_logo=company_logo,
+            skills=skills,
+            experience_range=experience_range,
+            company_rating=company_rating,
+            company_reviews_count=company_reviews_count,
+            vacancy_count=vacancy_count,
+            work_from_home_type=work_from_home_type,
+        )
+        log.debug(f"Processed job: {title} at {company}")
+        return job_post
+
+    def _get_location(self, placeholders: list[dict]) -> Location:
+        """
+        Extracts location data from placeholders
+
+        Uses the first placeholder of type "location"; without one the result
+        only carries the country (India).
+        """
+        for placeholder in placeholders:
+            if placeholder.get("type") != "location":
+                continue
+            location_str = placeholder.get("label") or ""
+            # NOTE(review): assumes the label reads "city, state" - confirm
+            parts = location_str.split(", ")
+            city = parts[0] or None
+            state = parts[1] if len(parts) > 1 else None
+            location = Location(city=city, state=state, country=Country.INDIA)
+            log.debug(f"Parsed location: {location.display_location()}")
+            return location
+        return Location(country=Country.INDIA)
+
+    def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
+        """
+        Extracts compensation data from placeholders, handling Indian salary formats (Lakhs, Crores)
+
+        Only the first placeholder of type "salary" is looked at.
+
+        :return: Compensation in INR, or None when the salary is missing,
+            "Not disclosed", or not a "<min>-<max> <unit>" range
+        """
+        multipliers = {
+            "lacs": 100_000,  # 1 Lakh = 100,000 INR
+            "lakh": 100_000,
+            "cr": 10_000_000,  # 1 Crore = 10,000,000 INR
+        }
+        for placeholder in placeholders:
+            if placeholder.get("type") != "salary":
+                continue
+            salary_text = (placeholder.get("label") or "").strip()
+            if salary_text == "Not disclosed":
+                log.debug("Salary not disclosed")
+                return None
+
+            # Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
+            salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
+            if not salary_match:
+                log.debug(f"Could not parse salary: {salary_text}")
+                return None
+
+            low, high, unit = salary_match.group(1, 2, 3)
+            multiplier = multipliers[unit.lower()]
+            # round() rather than int(): a product such as 2.3 * 100000 can land
+            # just below the whole number, and truncating would lose a rupee.
+            min_amount = round(float(low) * multiplier)
+            max_amount = round(float(high) * multiplier)
+            log.debug(f"Parsed salary: {min_amount} - {max_amount} INR")
+            return Compensation(
+                min_amount=min_amount,
+                max_amount=max_amount,
+                currency="INR",
+            )
+        return None
+
+    def _parse_date(self, label: str, created_date: int) -> Optional[date]:
+        """
+        Parses date from footerPlaceholderLabel or createdDate, returning a date object
+
+        :param label: relative label such as "today" or "3 days ago"; may be empty
+        :param created_date: creation time as a millisecond timestamp; used when
+            the label is missing or cannot be interpreted
+        :return: the posting date, or None when neither input yields one
+        """
+        # Local import: the module only imports datetime and date from datetime.
+        from datetime import timedelta
+
+        today = datetime.now()
+        if label:
+            label = label.lower()
+            if "today" in label or "just now" in label or "few hours" in label:
+                log.debug("Date parsed as today")
+                return today.date()
+            if "ago" in label:
+                match = re.search(r"(\d+)\s*day", label)
+                if match:
+                    days = int(match.group(1))
+                    # timedelta crosses month and year boundaries; replacing the
+                    # day-of-month fails as soon as days >= today.day.
+                    parsed_date = (today - timedelta(days=days)).date()
+                    log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
+                    return parsed_date
+        if created_date:
+            parsed_date = datetime.fromtimestamp(created_date / 1000).date()
+            log.debug(f"Date parsed from timestamp: {parsed_date}")
+            return parsed_date
+        log.debug("No date parsed")
+        return None
+
+    def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
+        """
+        Infers work-from-home type from job data (e.g., 'Hybrid', 'Remote', 'Work from office')
+
+        "hybrid" is checked before "remote" in the location label, title and
+        description. If neither word appears the job is reported as
+        'Work from office', so in practice a string is always returned.
+        """
+        location_str = next(
+            (p.get("label") or "" for p in placeholders if p.get("type") == "location"),
+            "",
+        ).lower()
+        searched = (location_str, title.lower(), description.lower())
+        if any("hybrid" in text for text in searched):
+            return "Hybrid"
+        if any("remote" in text for text in searched):
+            return "Remote"
+        return "Work from office"
--- a/jobspy/naukri/constant.py
+++ b/jobspy/naukri/constant.py
@ -0,0 +1,11 @@
+# HTTP request headers for Naukri; jobspy/naukri/init.py imports this dict as
+# naukri_headers and merges it into the scraper session's headers.
+headers = {
+    "authority": "www.naukri.com",
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "en-US,en;q=0.9",
+    "cache-control": "max-age=0",
+    "upgrade-insecure-requests": "1",
+    "appid": "109",
+    "systemid": "Naukri",
+    # NOTE(review): hard-coded token - presumably captured from a browser session;
+    # confirm it does not expire or get tied to one client.
+    "Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
+    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+}
--- a/jobspy/naukri/util.py
+++ b/jobspy/naukri/util.py
@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from bs4 import BeautifulSoup
+from jobspy.model import JobType, Location
+from jobspy.util import get_enum_from_job_type
+
+
+def parse_job_type(soup: BeautifulSoup | str) -> list[JobType] | None:
+    """
+    Gets the job type from the job page
+
+    :param soup: parsed job page, or its markup as a plain string
+    :return: one-element list with the matching JobType, or None when there is
+        no non-empty <span class="job-type"> or its text maps to no JobType
+    """
+    if isinstance(soup, str):
+        # Callers may hand over the raw description text; str.find() does not
+        # accept the class_ keyword used below, so parse it first.
+        soup = BeautifulSoup(soup, "html.parser")
+    job_type_tag = soup.find("span", class_="job-type")
+    if not job_type_tag:
+        return None
+    job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
+    if not job_type_str:
+        return None
+    # presumably get_enum_from_job_type returns None for unknown text - confirm;
+    # a [None] list is avoided either way.
+    job_type = get_enum_from_job_type(job_type_str)
+    return [job_type] if job_type else None
+
+
+def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
+    """
+    Gets the company industry from the job page
+
+    :param soup: parsed job page, or its markup as a plain string
+    :return: stripped text of the first <span class="industry">, or None when
+        there is no such tag
+    """
+    if isinstance(soup, str):
+        # Callers may hand over the raw description text; str.find() does not
+        # accept the class_ keyword used below, so parse it first.
+        soup = BeautifulSoup(soup, "html.parser")
+    industry_tag = soup.find("span", class_="industry")
+    return industry_tag.get_text(strip=True) if industry_tag else None
+
+
+def is_job_remote(title: str, description: str, location: Location) -> bool:
+    """
+    Reports whether the title, description, or displayed location mentions remote work
+
+    The match is a case-insensitive substring test for "remote",
+    "work from home" or "wfh".
+    """
+    searchable_text = f"{title} {description} {location.display_location()}".lower()
+    for keyword in ("remote", "work from home", "wfh"):
+        if keyword in searchable_text:
+            return True
+    return False
--- a/jobspy/util.py
+++ b/jobspy/util.py
@ -344,4 +344,11 @@ desired_order = [
    "company_num_employees",
    "company_revenue",
    "company_description",
+    #naukri-specific fields
+    "skills",
+    "experience_range",
+    "company_rating",
+    "company_reviews_count",
+    "vacancy_count",
+    "work_from_home_type",
 ]