Compare commits

..

4 Commits

Author SHA1 Message Date
Cullen Watson
d4d52d05f5 chore:version 2025-03-21 17:35:23 -05:00
Liju Thomas
0946cb3373 feat: add naukri.com support (#259) 2025-03-21 17:23:07 -05:00
prudvisorra-aifa
051981689f Update util.py (#256) 2025-03-17 11:51:19 -05:00
Cullen Watson
903b7e6f1b fix(linkedin):is remote 2025-03-06 13:38:28 -06:00
26 changed files with 502 additions and 1924 deletions

View File

@@ -1,49 +0,0 @@
# Manually dispatched workflow: runs job_scraper_dynamic.py for one user and
# uploads the resulting CSV as a build artifact.
name: JobSpy Scraper Dynamic Workflow

on:
  workflow_dispatch:
    inputs:
      user_email:
        description: 'Email of user'
        required: true
        default: 'Branden@autoemployme.onmicrosoft.com'

permissions:
  contents: read
  id-token: write

jobs:
  scrape_jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3

      - name: Set Up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install Dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt

      # The user-supplied email is untrusted input. It is handed to the shell
      # through an environment variable instead of being expanded into the
      # script text, so shell metacharacters in the value cannot run commands.
      - name: Sanitize Email + Create Run ID
        id: vars
        env:
          USER_EMAIL: ${{ github.event.inputs.user_email }}
        run: |
          safe_email=$(printf '%s' "$USER_EMAIL" | sed 's/@/_at_/g; s/\./_/g')
          run_id=$(date +%s)
          echo "safe_email=$safe_email" >> "$GITHUB_OUTPUT"
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"

      - name: Run Job Scraper
        env:
          USER_EMAIL: ${{ github.event.inputs.user_email }}
          RUN_ID: ${{ steps.vars.outputs.run_id }}
        run: |
          python job_scraper_dynamic.py "$USER_EMAIL" "$RUN_ID"

      # File name must match what job_scraper_dynamic.py writes:
      # outputs/jobspy_output_<safe_email>_<run_id>.csv
      - name: Upload Output Artifact
        uses: actions/upload-artifact@v4
        with:
          name: jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}
          path: outputs/jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}.csv

View File

@@ -1,48 +0,0 @@
# Manually dispatched workflow: runs job_scraper_exact_match.py and uploads
# the jobspy_output.csv it produces as the "jobspy-results" artifact.
name: JobSpy Scraper Workflow
on:
workflow_dispatch: # Allows manual trigger from GitHub or Power Automate
# Remove or comment out the schedule to prevent auto-runs
# schedule:
# - cron: '0 */6 * * *' # Runs every 6 hours (DISABLED)
permissions:
actions: read
contents: read
id-token: write
jobs:
scrape_jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run JobSpy Scraper
run: python job_scraper_exact_match.py
# Fail the job with an explicit message when the scraper produced no CSV,
# rather than letting the upload step report a missing path.
- name: Debug - Check if jobspy_output.csv exists
run: |
if [ ! -f jobspy_output.csv ]; then
echo "❌ ERROR: jobspy_output.csv not found!"
exit 1
else
echo "✅ jobspy_output.csv found, proceeding to upload..."
fi
- name: Upload JobSpy Output as Artifact
uses: actions/upload-artifact@v4 # Explicitly using latest version
with:
name: jobspy-results
path: jobspy_output.csv

View File

@@ -4,7 +4,7 @@
## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
- Aggregates the job postings in a dataframe
- Proxies support to bypass blocking
@@ -25,7 +25,7 @@ import csv
from jobspy import scrape_jobs
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
search_term="software engineer",
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
@@ -51,6 +51,7 @@ linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale
linkedin Full-Stack Software Engineer Rain New York NY fulltime yearly None None https://www.linkedin.com/jobs/view/3696158877 Rains mission is to create the fastest and ea...
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...
```
### Parameters for `scrape_jobs()`
@@ -220,6 +221,7 @@ JobPost
│ ├── country
│ ├── city
│ ├── state
├── is_remote
├── description
├── job_type: fulltime, parttime, internship, contract
├── job_function
@@ -229,8 +231,7 @@ JobPost
│ ├── currency
│ └── salary_source: direct_data, description (parsed from posting)
├── date_posted
├── emails
└── is_remote
└── emails
Linkedin specific
└── job_level
@@ -245,4 +246,12 @@ Indeed specific
├── company_revenue_label
├── company_description
└── company_logo
Naukri specific
├── skills
├── experience_range
├── company_rating
├── company_reviews_count
├── vacancy_count
└── work_from_home_type
```

View File

@@ -1,8 +0,0 @@
{
"search_terms": ["IT Support", "Help Desk"],
"results_wanted": 50,
"max_days_old": 7,
"target_state": "NY",
"user_email": "Branden@autoemployme.onmicrosoft.com"
}

View File

@@ -1 +0,0 @@
{"search_terms":["False tester"],"results_wanted":"100\n","max_days_old":"1\n","target_state":"NY","user_email":"Branden@autoemployme.onmicrosoft.com"}

View File

@@ -1,116 +0,0 @@
import csv
import datetime
from jobspy.google import Google
from jobspy.linkedin import LinkedIn
from jobspy.indeed import Indeed
from jobspy.ziprecruiter import ZipRecruiter
from jobspy.model import ScraperInput
# Define job sources
# Each key is the site name handed to ScraperInput(site_type=[...]); each value
# is the scraper class that scrape_jobs() instantiates for that site.
sources = {
"google": Google,
"linkedin": LinkedIn,
"indeed": Indeed,
"zip_recruiter": ZipRecruiter,
}
# Define search preferences
# These module-level values are the arguments passed to scrape_jobs() at the
# bottom of the script.
search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist"]
results_wanted = 200 # Fetch more jobs
max_days_old = 2 # Fetch jobs posted in last 48 hours
target_state = "NY" # Only keep jobs from New York
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Scrape every configured source for each search term and filter the results.

    A job is kept when it has a posting date no older than ``max_days_old``
    days and is either located in ``target_state`` or flagged as remote.

    :param search_terms: iterable of search strings, each queried on every source
    :param results_wanted: number of results requested from each source per term
    :param max_days_old: maximum age, in days, of a posting to keep
    :param target_state: upper-case state code compared with the job's state
    :return: list of dict rows, one per kept job, keyed by the CSV column names
    """
    all_jobs = []
    today = datetime.date.today()
    print("\n🔎 DEBUG: Fetching jobs for search terms:", search_terms)

    for search_term in search_terms:
        for source_name, source_class in sources.items():
            print(f"\n🚀 Scraping {search_term} from {source_name}...")
            scraper = source_class()
            search_criteria = ScraperInput(
                site_type=[source_name],
                search_term=search_term,
                results_wanted=results_wanted,
            )
            # A single source failing (blocked, rate-limited, ...) must not
            # throw away everything collected from the other sources.
            try:
                job_response = scraper.scrape(search_criteria)
            except Exception as e:
                print(f"⚠️ {source_name} error: {e}")
                continue

            for job in job_response.jobs:
                # Normalize location fields
                location_city = job.location.city.strip() if job.location.city else "Unknown"
                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
                location_country = str(job.location.country) if job.location.country else "Unknown"

                # Debug: Show all jobs being fetched
                print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")

                # Ensure the job is recent (jobs without a date are treated as too old)
                is_recent = job.date_posted and (today - job.date_posted).days <= max_days_old
                if not is_recent:
                    print(f"⏳ Ignored (Too Old): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                if location_state != target_state and not job.is_remote:
                    print(f"❌ Ignored (Wrong State): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                print(f"✅ MATCH (In {target_state} or Remote): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                all_jobs.append(
                    _job_to_row(job, location_city, location_state, location_country, source_name)
                )

    print(f"\n{len(all_jobs)} jobs retrieved in {target_state}")
    return all_jobs


def _job_to_row(job, location_city, location_state, location_country, source_name):
    """Flatten one job posting into the dict row written to the CSV."""
    return {
        "Job ID": job.id,
        "Job Title (Primary)": job.title,
        "Company Name": job.company_name if job.company_name else "Unknown",
        "Industry": job.company_industry if job.company_industry else "Not Provided",
        "Experience Level": job.job_level if job.job_level else "Not Provided",
        "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
        "Is Remote": job.is_remote,
        "Currency": job.compensation.currency if job.compensation else "",
        "Salary Min": job.compensation.min_amount if job.compensation else "",
        "Salary Max": job.compensation.max_amount if job.compensation else "",
        "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
        "Location City": location_city,
        "Location State": location_state,
        "Location Country": location_country,
        "Job URL": job.job_url,
        # Descriptions are truncated to keep the CSV cells small
        "Job Description": job.description[:500] if job.description else "No description available",
        "Job Source": source_name,
    }
def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
    """Write the job rows to ``filename`` as a regular CSV with a header line.

    When ``jobs`` is empty a warning is printed and no file is written.
    """
    if jobs:
        columns = [
            "Job ID", "Job Title (Primary)", "Company Name", "Industry",
            "Experience Level", "Job Type", "Is Remote", "Currency",
            "Salary Min", "Salary Max", "Date Posted", "Location City",
            "Location State", "Location Country", "Job URL", "Job Description",
            "Job Source",
        ]
        with open(filename, mode="w", newline="", encoding="utf-8") as handle:
            csv_writer = csv.DictWriter(handle, fieldnames=columns)
            csv_writer.writeheader()
            for row in jobs:
                csv_writer.writerow(row)
        print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
    else:
        print("⚠️ No jobs found matching criteria.")
# Run the scraper with multiple job searches
# NOTE: these statements are at module top level, so the scrape starts as soon
# as this file is executed (or imported).
job_data = scrape_jobs(
search_terms=search_terms,
results_wanted=results_wanted,
max_days_old=max_days_old,
target_state=target_state
)
# Save results to CSV (written to the default "jobspy_output.csv")
save_jobs_to_csv(job_data)

View File

@@ -1,94 +0,0 @@
import csv, datetime, os, sys, json
from jobspy.google import Google
from jobspy.linkedin import LinkedIn
from jobspy.indeed import Indeed
from jobspy.model import ScraperInput
# Site name -> scraper class. The key is passed as ScraperInput(site_type=[key])
# and recorded in the "Job Source" column; the class is instantiated per search.
sources = {
"google": Google,
"linkedin": LinkedIn,
"indeed": Indeed,
}
def sanitize_email(email):
    """Return ``email`` with ``@`` replaced by ``_at_`` and every ``.`` by ``_``.

    The result is used as part of config and output file names.
    """
    substitutions = str.maketrans({"@": "_at_", ".": "_"})
    return email.translate(substitutions)
def load_config(email):
    """Load the JSON config stored for ``email``.

    The file is looked up at ``configs/config_<sanitized email>.json``.

    :return: tuple ``(parsed config, sanitized email)``
    :raises FileNotFoundError: when no config file exists for the email
    """
    safe_email = sanitize_email(email)
    config_path = os.path.join("configs", "config_" + safe_email + ".json")
    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as handle:
            settings = json.load(handle)
        return settings, safe_email
    raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Query every source for every search term and return the matching jobs.

    A job is kept only if it has a posting date at most ``max_days_old`` days
    old, is in ``target_state`` or remote, and its title contains at least one
    of the search terms (case-insensitive). Sources that raise are reported
    and skipped.

    :return: list of dict rows keyed by the output column names
    """
    today = datetime.date.today()
    all_jobs = []
    for term in search_terms:
        for source, Scraper in sources.items():
            print(f"🔍 Scraping {term} from {source}")
            scraper = Scraper()
            try:
                criteria = ScraperInput(
                    site_type=[source],
                    search_term=term,
                    results_wanted=results_wanted
                )
                jobs = scraper.scrape(criteria).jobs
            except Exception as e:
                print(f"⚠️ {source} error: {e}")
                continue

            for job in jobs:
                # Undated or stale postings are dropped first
                if not job.date_posted or (today - job.date_posted).days > max_days_old:
                    continue
                # Must be in the requested state unless the job is remote
                if target_state != (job.location.state or "").upper() and not job.is_remote:
                    continue
                # Title has to mention one of the requested terms
                title_lower = job.title.lower()
                if not any(wanted.lower() in title_lower for wanted in search_terms):
                    continue

                compensation = job.compensation
                all_jobs.append({
                    "Job ID": job.id,
                    "Job Title (Primary)": job.title,
                    "Company Name": job.company_name or "Unknown",
                    "Industry": job.company_industry or "Not Provided",
                    "Experience Level": job.job_level or "Not Provided",
                    "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                    "Is Remote": job.is_remote,
                    "Currency": compensation.currency if compensation else "",
                    "Salary Min": compensation.min_amount if compensation else "",
                    "Salary Max": compensation.max_amount if compensation else "",
                    "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
                    "Location City": job.location.city or "Unknown",
                    "Location State": (job.location.state or "Unknown").upper(),
                    "Location Country": job.location.country or "Unknown",
                    "Job URL": job.job_url,
                    "Job Description": job.description.replace(",", "") if job.description else "No description",
                    "Job Source": source
                })
    print(f"✅ Found {len(all_jobs)} jobs")
    return all_jobs
def save_to_csv(jobs, path):
    """Write ``jobs`` to ``path`` in the custom delimited format.

    Fields inside a record are joined with ``|~|`` and records (header first)
    are joined with a single comma, so every comma is stripped from the field
    values. Columns missing from a job are written as ``Not Provided``.

    :param jobs: list of dict rows keyed by the column names below
    :param path: output file path; its parent directory is created if needed
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises, so only
    # create the parent when the path actually has one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    fieldnames = [
        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
        "Experience Level", "Job Type", "Is Remote", "Currency",
        "Salary Min", "Salary Max", "Date Posted", "Location City",
        "Location State", "Location Country", "Job URL", "Job Description", "Job Source"
    ]
    header = "|~|".join(fieldnames)
    rows = [header] + [
        "|~|".join(
            str(job.get(col, "Not Provided")).replace(",", "").strip()
            for col in fieldnames
        )
        for job in jobs
    ]
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(rows))
    print(f"💾 Saved output to: {path}")
def _coerce_int(value, name):
    """Return config entry ``name`` as an int, accepting numeric strings such as "100\\n"."""
    try:
        # int() tolerates surrounding whitespace/newlines in strings
        return int(value)
    except (TypeError, ValueError):
        raise ValueError(f"❌ Config value '{name}' must be an integer, got {value!r}") from None


# Entry point: python job_scraper_dynamic.py <user_email> <run_id>
# Loads the user's config, scrapes, and writes
# outputs/jobspy_output_<safe_email>_<run_id>.csv. Any failure is printed and
# the process exits with status 1.
if __name__ == "__main__":
    try:
        if len(sys.argv) != 3:
            raise ValueError("❌ Usage: python job_scraper_dynamic.py <user_email> <run_id>")
        user_email, run_id = sys.argv[1], sys.argv[2]
        config, safe_email = load_config(user_email)
        # Configs may carry the numeric settings as strings (e.g. "100\n");
        # scrape_jobs compares max_days_old with an int, so normalise here.
        jobs = scrape_jobs(
            config["search_terms"],
            _coerce_int(config["results_wanted"], "results_wanted"),
            _coerce_int(config["max_days_old"], "max_days_old"),
            config["target_state"],
        )
        save_to_csv(jobs, f"outputs/jobspy_output_{safe_email}_{run_id}.csv")
    except Exception as e:
        print(f"❌ Fatal error: {e}")
        sys.exit(1)

View File

@@ -1,146 +0,0 @@
import csv
import datetime
import os
from jobspy.google import Google
from jobspy.linkedin import LinkedIn
from jobspy.indeed import Indeed
from jobspy.model import ScraperInput
# Define job sources
# Each key is the site name handed to ScraperInput(site_type=[...]); each value
# is the scraper class that scrape_jobs() instantiates for that site.
sources = {
"google": Google,
"linkedin": LinkedIn,
"indeed": Indeed,
}
# Define search preferences
# search_terms doubles as the title filter: scrape_jobs() drops any job whose
# title contains none of these strings (case-insensitive).
search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist", "CRM", "Project Manager", "POS", "Microsoft Power", "IT Support"]
results_wanted = 100 # Fetch more jobs
max_days_old = 2 # Fetch jobs posted in last 48 hours
target_state = "NY" # Only keep jobs from New York
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Scrape every configured source for each search term and filter the results.

    A job is kept when its title contains at least one search term
    (case-insensitive), it has a posting date no older than ``max_days_old``
    days, and it is either located in ``target_state`` or flagged as remote.

    :param search_terms: iterable of search strings, also used as the title filter
    :param results_wanted: number of results requested from each source per term
    :param max_days_old: maximum age, in days, of a posting to keep
    :param target_state: upper-case state code compared with the job's state
    :return: list of dict rows, one per kept job, keyed by the output column names
    """
    all_jobs = []
    today = datetime.date.today()
    # Lower-case the filter terms once instead of once per job
    wanted_terms = [term.lower() for term in search_terms]
    print("\n🔎 DEBUG: Fetching jobs for search terms:", search_terms)

    for search_term in search_terms:
        for source_name, source_class in sources.items():
            print(f"\n🚀 Scraping {search_term} from {source_name}...")
            scraper = source_class()
            search_criteria = ScraperInput(
                site_type=[source_name],
                search_term=search_term,
                results_wanted=results_wanted,
            )
            # A single source failing (blocked, rate-limited, ...) must not
            # throw away everything collected from the other sources.
            try:
                job_response = scraper.scrape(search_criteria)
            except Exception as e:
                print(f"⚠️ {source_name} error: {e}")
                continue

            for job in job_response.jobs:
                # Normalize location fields
                location_city = job.location.city.strip() if job.location.city else "Unknown"
                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
                location_country = str(job.location.country) if job.location.country else "Unknown"

                # Debug: Show all jobs being fetched
                print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")

                # Exclude jobs that don't explicitly match the search terms
                title_lower = job.title.lower()
                if not any(term in title_lower for term in wanted_terms):
                    print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
                    continue

                # Ensure the job is recent (jobs without a date are treated as too old)
                is_recent = job.date_posted and (today - job.date_posted).days <= max_days_old
                if not is_recent:
                    print(f"⏳ Ignored (Too Old): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                # Only accept jobs in the target state or remote ones
                if location_state != target_state and not job.is_remote:
                    print(f"❌ Ignored (Wrong State): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                all_jobs.append(
                    _job_to_row(job, location_city, location_state, location_country, source_name)
                )

    print(f"\n{len(all_jobs)} jobs retrieved in {target_state}")
    return all_jobs


def _job_to_row(job, location_city, location_state, location_country, source_name):
    """Flatten one job posting into the dict row used for the output file."""
    return {
        "Job ID": job.id,
        "Job Title (Primary)": job.title,
        "Company Name": job.company_name if job.company_name else "Unknown",
        "Industry": job.company_industry if job.company_industry else "Not Provided",
        "Experience Level": job.job_level if job.job_level else "Not Provided",
        "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
        "Is Remote": job.is_remote,
        "Currency": job.compensation.currency if job.compensation else "",
        "Salary Min": job.compensation.min_amount if job.compensation else "",
        "Salary Max": job.compensation.max_amount if job.compensation else "",
        "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
        "Location City": location_city,
        "Location State": location_state,
        "Location Country": location_country,
        "Job URL": job.job_url,
        # Commas are removed because the output format uses "," between records
        "Job Description": job.description.replace(",", "") if job.description else "No description available",
        "Job Source": source_name,
    }
def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
    """Write the job rows to ``filename`` in the custom delimited format.

    Layout of the output (a single line of text):
    - the header record comes first, followed by one record per job
    - fields inside a record are joined with ``|~|``
    - records are joined with a comma, so commas are stripped from values
    - blank or missing values become ``Not Provided``

    When ``jobs`` is empty a warning is printed and nothing is written.
    """
    if not jobs:
        print("⚠️ No jobs found matching criteria.")
        return

    # Drop any previous output before writing the new one
    if os.path.exists(filename):
        os.remove(filename)

    columns = (
        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
        "Experience Level", "Job Type", "Is Remote", "Currency",
        "Salary Min", "Salary Max", "Date Posted", "Location City",
        "Location State", "Location Country", "Job URL", "Job Description",
        "Job Source",
    )

    def render(job):
        # Blank check happens before comma removal, as in the record contract above
        cells = []
        for column in columns:
            text = str(job.get(column, "")).strip() or "Not Provided"
            cells.append(text.replace(",", ""))
        return "|~|".join(cells)

    payload = ",".join(["|~|".join(columns), *map(render, jobs)])
    with open(filename, "w", encoding="utf-8") as out:
        out.write(payload)
    print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
# Run the scraper with multiple job searches
# NOTE: these statements are at module top level, so the scrape starts as soon
# as this file is executed (or imported).
job_data = scrape_jobs(
search_terms=search_terms,
results_wanted=results_wanted,
max_days_old=max_days_old,
target_state=target_state
)
# Save results to CSV with custom formatting (default file "jobspy_output.csv")
save_jobs_to_csv(job_data)

View File

@@ -10,6 +10,7 @@ from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.naukri import Naukri
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
@@ -57,6 +58,7 @@ def scrape_jobs(
Site.GLASSDOOR: Glassdoor,
Site.GOOGLE: Google,
Site.BAYT: BaytScraper,
Site.NAUKRI: Naukri,
}
set_logger_level(verbose)
job_type = get_enum_from_value(job_type) if job_type else None
@@ -139,6 +141,7 @@ def scrape_jobs(
**job_data["location"]
).display_location()
# Handle compensation
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
job_data["interval"] = (
@@ -157,7 +160,6 @@ def scrape_jobs(
and job_data["max_amount"]
):
convert_to_annual(job_data)
else:
if country_enum == Country.USA:
(
@@ -176,6 +178,17 @@ def scrape_jobs(
if "min_amount" in job_data and job_data["min_amount"]
else None
)
#naukri-specific fields
job_data["skills"] = (
", ".join(job_data["skills"]) if job_data["skills"] else None
)
job_data["experience_range"] = job_data.get("experience_range")
job_data["company_rating"] = job_data.get("company_rating")
job_data["company_reviews_count"] = job_data.get("company_reviews_count")
job_data["vacancy_count"] = job_data.get("vacancy_count")
job_data["work_from_home_type"] = job_data.get("work_from_home_type")
job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)
@@ -199,4 +212,4 @@ def scrape_jobs(
by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
else:
return pd.DataFrame()
return pd.DataFrame()

View File

@@ -34,3 +34,7 @@ class GoogleJobsException(Exception):
class BaytException(Exception):
    """Error raised for Bayt scraping failures; falls back to a generic message."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Bayt"
        super().__init__(message)
class NaukriException(Exception):
    """Error raised for Naukri scraping failures; falls back to a generic message."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Naukri"
        super().__init__(message)

View File

@@ -205,8 +205,6 @@ class Indeed(Scraper):
description = job["description"]["html"]
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
description = description.replace(",", "")
job_type = get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000

View File

@@ -20,7 +20,7 @@ def get_job_type(attributes: list) -> list[JobType]:
def get_compensation(compensation: dict) -> Compensation | None:
"""
Parses the job to get compensation
:param sssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrompensation:
:param compensation:
:return: compensation object
"""
if not compensation["baseSalary"] and not compensation["estimated"]:
@@ -58,11 +58,14 @@ def is_job_remote(job: dict, description: str) -> bool:
any(keyword in attr["label"].lower() for keyword in remote_keywords)
for attr in job["attributes"]
)
is_remote_in_description = any(
keyword in description.lower() for keyword in remote_keywords
)
is_remote_in_location = any(
keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords
)
return is_remote_in_attributes or is_remote_in_location
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
def get_compensation_interval(interval: str) -> CompensationInterval:

View File

@@ -14,10 +14,11 @@ from bs4.element import Tag
from jobspy.exception import LinkedInException
from jobspy.linkedin.constant import headers
from jobspy.linkedin.util import (
is_job_remote,
job_type_code,
parse_job_type,
parse_job_level,
parse_company_industry,
parse_company_industry
)
from jobspy.model import (
JobPost,
@@ -173,7 +174,7 @@ class LinkedIn(Scraper):
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None
compensation = description = None
if salary_tag:
salary_text = salary_tag.get_text(separator=" ").strip()
salary_values = [currency_parser(value) for value in salary_text.split("-")]
@@ -217,8 +218,8 @@ class LinkedIn(Scraper):
job_details = {}
if full_descr:
job_details = self._get_job_details(job_id)
description = description.replace(",", "")
description = job_details.get("description")
is_remote = is_job_remote(title, description, location)
return JobPost(
id=f"li-{job_id}",
@@ -226,6 +227,7 @@ class LinkedIn(Scraper):
company_name=company,
company_url=company_url,
location=location,
is_remote=is_remote,
date_posted=date_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
@@ -234,7 +236,7 @@ class LinkedIn(Scraper):
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
emails=extract_emails_from_text(description),
company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from jobspy.model import JobType
from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type
@@ -83,3 +83,14 @@ def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
industry = industry_span.get_text(strip=True)
return industry
def is_job_remote(title: dict, description: str, location: Location) -> bool:
"""
Searches the title, location, and description to check if job is remote
"""
remote_keywords = ["remote", "work from home", "wfh"]
location = location.display_location()
full_string = f'{title} {description} {location}'.lower()
is_remote = any(keyword in full_string for keyword in remote_keywords)
return is_remote

View File

@@ -254,13 +254,13 @@ class JobPost(BaseModel):
is_remote: bool | None = None
listing_type: str | None = None
# linkedin specific
# LinkedIn specific
job_level: str | None = None
# linkedin and indeed specific
# LinkedIn and Indeed specific
company_industry: str | None = None
# indeed specific
# Indeed specific
company_addresses: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
@@ -268,9 +268,16 @@ class JobPost(BaseModel):
company_logo: str | None = None
banner_photo_url: str | None = None
# linkedin only atm
# LinkedIn only atm
job_function: str | None = None
# Naukri specific
skills: list[str] | None = None #from tagsAndSkills
experience_range: str | None = None #from experienceText
company_rating: float | None = None #from ambitionBoxData.AggregateRating
company_reviews_count: int | None = None #from ambitionBoxData.ReviewsCount
vacancy_count: int | None = None #from vacancy
work_from_home_type: str | None = None #from clusters.wfhType (e.g., "Hybrid", "Remote")
class JobResponse(BaseModel):
jobs: list[JobPost] = []
@@ -283,6 +290,7 @@ class Site(Enum):
GLASSDOOR = "glassdoor"
GOOGLE = "google"
BAYT = "bayt"
NAUKRI = "naukri"
class SalarySource(Enum):

301
jobspy/naukri/__init__.py Normal file
View File

@@ -0,0 +1,301 @@
from __future__ import annotations
import math
import random
import time
from datetime import datetime, date
from typing import Optional
import regex as re
import requests
from jobspy.exception import NaukriException
from jobspy.naukri.constant import headers as naukri_headers
from jobspy.naukri.util import (
is_job_remote,
parse_job_type,
parse_company_industry,
)
from jobspy.model import (
JobPost,
Location,
JobResponse,
Country,
Compensation,
DescriptionFormat,
Scraper,
ScraperInput,
Site,
)
from jobspy.util import (
extract_emails_from_text,
currency_parser,
markdown_converter,
create_session,
create_logger,
)
log = create_logger("Naukri")
class Naukri(Scraper):
base_url = "https://www.naukri.com/jobapi/v3/search"
delay = 3
band_delay = 4
jobs_per_page = 20
def __init__(
    self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
    """
    Sets up the Naukri scraper: registers the site with the base Scraper and
    builds the HTTP session (with the Naukri headers) used for the API calls
    """
    super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert)

    # self.proxies is read back from the instance rather than the argument;
    # presumably the base Scraper stores it — verify against Scraper.__init__
    session_options = dict(
        proxies=self.proxies,
        ca_cert=ca_cert,
        is_tls=False,
        has_retry=True,
        delay=5,
        clear_cookies=True,
    )
    self.session = create_session(**session_options)
    self.session.headers.update(naukri_headers)

    # Filled in by scrape() with the active search criteria
    self.scraper_input = None
    self.country = "India"  # naukri is india-focused by default
    log.info("Naukri scraper initialized")
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
    """
    Scrapes Naukri API for jobs with scraper_input criteria

    Pages through the search endpoint until results_wanted jobs have been
    collected, the API returns an empty page, or page 50 has been passed.
    A failed request or a non-2xx/3xx status ends the scrape early and
    returns whatever was collected so far.

    :param scraper_input: search criteria (search term, location, offset, ...)
    :return: job_response holding at most results_wanted jobs
    :raises NaukriException: if a job entry cannot be converted to a JobPost
    """
    self.scraper_input = scraper_input
    job_list: list[JobPost] = []
    seen_ids = set()
    start = scraper_input.offset or 0
    page = (start // self.jobs_per_page) + 1
    request_count = 0
    seconds_old = (
        scraper_input.hours_old * 3600 if scraper_input.hours_old else None
    )
    # Loop-invariant values, computed once
    total_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
    # NOTE(review): the LinkedIn-named flag is reused to decide whether the
    # description is kept — confirm this is the intended switch for Naukri.
    fetch_desc = scraper_input.linkedin_fetch_description

    def continue_search() -> bool:
        # Reads the enclosing job_list/page each time it is called
        return len(job_list) < scraper_input.results_wanted and page <= 50  # Arbitrary limit

    while continue_search():
        request_count += 1
        log.info(
            f"Scraping page {request_count} / {total_pages} "
            f"for search term: {scraper_input.search_term}"
        )
        params = {
            "noOfResults": self.jobs_per_page,
            "urlType": "search_by_keyword",
            "searchType": "adv",
            "keyword": scraper_input.search_term,
            "pageNo": page,
            "k": scraper_input.search_term,
            "seoKey": f"{scraper_input.search_term.lower().replace(' ', '-')}-jobs",
            "src": "jobsearchDesk",
            "latLong": "",
            "location": scraper_input.location,
            "remote": "true" if scraper_input.is_remote else None,
        }
        if seconds_old:
            # NOTE(review): integer division sends days=0 when hours_old < 24 —
            # confirm how the API interprets that.
            params["days"] = seconds_old // 86400  # Convert to days
        # Unset options are dropped rather than sent as empty values
        params = {k: v for k, v in params.items() if v is not None}

        try:
            log.debug(f"Sending request to {self.base_url} with params: {params}")
            response = self.session.get(self.base_url, params=params, timeout=10)
            if response.status_code not in range(200, 400):
                err = f"Naukri API response status code {response.status_code} - {response.text}"
                log.error(err)
                return JobResponse(jobs=job_list)
            data = response.json()
            job_details = data.get("jobDetails", [])
            log.info(f"Received {len(job_details)} job entries from API")
            if not job_details:
                log.warning("No job details found in API response")
                break
        except Exception as e:
            # Best effort: keep what has been collected so far
            log.error(f"Naukri API request failed: {str(e)}")
            return JobResponse(jobs=job_list)

        for job in job_details:
            job_id = job.get("jobId")
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            log.debug(f"Processing job ID: {job_id}")
            try:
                job_post = self._process_job(job, job_id, fetch_desc)
                if job_post:
                    job_list.append(job_post)
                    log.info(f"Added job: {job_post.title} (ID: {job_id})")
                if not continue_search():
                    break
            except Exception as e:
                log.error(f"Error processing job ID {job_id}: {str(e)}")
                # Chain explicitly so the original error is reported as the cause
                raise NaukriException(str(e)) from e

        if continue_search():
            # Randomised pause between page requests
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
            page += 1

    job_list = job_list[:scraper_input.results_wanted]
    log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
    return JobResponse(jobs=job_list)
def _process_job(
    self, job: dict, job_id: str, full_descr: bool
) -> Optional[JobPost]:
    """
    Processes a single job from API response into a JobPost object

    :param job: one entry of the API's "jobDetails" list
    :param job_id: the entry's "jobId", used for the post id and fallback URL
    :param full_descr: when False the description is dropped, and the fields
        derived from it (job type, industry, emails) are left empty
    :return: the populated JobPost
    """
    # Keys may be present with a null value, so `.get(key, default)` is not
    # enough to guarantee a list/dict here.
    placeholders = job.get("placeholders") or []
    ambition_box = job.get("ambitionBoxData") or {}

    title = job.get("title", "N/A")
    company = job.get("companyName", "N/A")
    static_url = job.get("staticUrl")
    company_url = f"https://www.naukri.com/{static_url}" if static_url else None

    location = self._get_location(placeholders)
    compensation = self._get_compensation(placeholders)
    date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))

    job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
    description = job.get("jobDescription") if full_descr else None
    if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
        description = markdown_converter(description)

    job_type = parse_job_type(description) if description else None
    company_industry = parse_company_industry(description) if description else None
    is_remote = is_job_remote(title, description or "", location)
    company_logo = job.get("logoPathV3") or job.get("logoPath")

    # Naukri-specific fields
    tags = job.get("tagsAndSkills")
    skills = tags.split(",") if tags else None
    experience_range = job.get("experienceText")
    rating = ambition_box.get("AggregateRating")
    company_rating = float(rating) if rating else None
    company_reviews_count = ambition_box.get("ReviewsCount")
    vacancy_count = job.get("vacancy")
    work_from_home_type = self._infer_work_from_home_type(placeholders, title, description or "")

    job_post = JobPost(
        id=f"nk-{job_id}",
        title=title,
        company_name=company,
        company_url=company_url,
        location=location,
        is_remote=is_remote,
        date_posted=date_posted,
        job_url=job_url,
        compensation=compensation,
        job_type=job_type,
        company_industry=company_industry,
        description=description,
        emails=extract_emails_from_text(description or ""),
        company_logo=company_logo,
        skills=skills,
        experience_range=experience_range,
        company_rating=company_rating,
        company_reviews_count=company_reviews_count,
        vacancy_count=vacancy_count,
        work_from_home_type=work_from_home_type,
    )
    log.debug(f"Processed job: {title} at {company}")
    return job_post
def _get_location(self, placeholders: list[dict]) -> Location:
    """
    Builds the job Location from the first placeholder of type "location"

    The label is split on ", ": the first piece is used as the city and the
    second, when present, as the state. Without a location placeholder an
    India-only Location is returned.
    """
    location_entry = next(
        (entry for entry in placeholders if entry.get("type") == "location"),
        None,
    )
    if location_entry is None:
        return Location(country=Country.INDIA)

    pieces = location_entry.get("label", "").split(", ")
    city = pieces[0] if pieces else None
    state = pieces[1] if len(pieces) > 1 else None
    parsed = Location(city=city, state=state, country=Country.INDIA)
    log.debug(f"Parsed location: {parsed.display_location()}")
    return parsed
def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
"""
Extracts compensation data from placeholders, handling Indian salary formats (Lakhs, Crores)
"""
for placeholder in placeholders:
if placeholder.get("type") == "salary":
salary_text = placeholder.get("label", "").strip()
if salary_text == "Not disclosed":
log.debug("Salary not disclosed")
return None
# Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
if salary_match:
min_salary, max_salary, unit = salary_match.groups()[:3]
min_salary, max_salary = float(min_salary), float(max_salary)
currency = "INR"
# Convert to base units (INR)
if unit.lower() in ("lacs", "lakh"):
min_salary *= 100000 # 1 Lakh = 100,000 INR
max_salary *= 100000
elif unit.lower() == "cr":
min_salary *= 10000000 # 1 Crore = 10,000,000 INR
max_salary *= 10000000
log.debug(f"Parsed salary: {min_salary} - {max_salary} INR")
return Compensation(
min_amount=int(min_salary),
max_amount=int(max_salary),
currency=currency,
)
else:
log.debug(f"Could not parse salary: {salary_text}")
return None
return None
def _parse_date(self, label: str, created_date: int) -> Optional[date]:
"""
Parses date from footerPlaceholderLabel or createdDate, returning a date object
"""
today = datetime.now()
if not label:
if created_date:
return datetime.fromtimestamp(created_date / 1000).date() # Convert to date
return None
label = label.lower()
if "today" in label or "just now" in label or "few hours" in label:
log.debug("Date parsed as today")
return today.date()
elif "ago" in label:
match = re.search(r"(\d+)\s*day", label)
if match:
days = int(match.group(1))
parsed_date = today.replace(day=today.day - days).date()
log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
return parsed_date
elif created_date:
parsed_date = datetime.fromtimestamp(created_date / 1000).date()
log.debug(f"Date parsed from timestamp: {parsed_date}")
return parsed_date
log.debug("No date parsed")
return None
def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
"""
Infers work-from-home type from job data (e.g., 'Hybrid', 'Remote', 'Work from office')
"""
location_str = next((p["label"] for p in placeholders if p["type"] == "location"), "").lower()
if "hybrid" in location_str or "hybrid" in title.lower() or "hybrid" in description.lower():
return "Hybrid"
elif "remote" in location_str or "remote" in title.lower() or "remote" in description.lower():
return "Remote"
elif "work from office" in description.lower() or not ("remote" in description.lower() or "hybrid" in description.lower()):
return "Work from office"
return None

11
jobspy/naukri/constant.py Normal file
View File

@@ -0,0 +1,11 @@
# Browser-like HTTP headers targeting www.naukri.com (see the "authority" entry).
# NOTE(review): presumably attached to the Naukri scraper's requests; confirm
# against the module that imports this constant.
headers = {
"authority": "www.naukri.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
# NOTE(review): "appid", "systemid" and "Nkparam" look like site-specific
# client identifiers copied from a browser session; the hard-coded "Nkparam"
# value may need refreshing if the site starts rejecting requests. TODO confirm.
"appid": "109",
"systemid": "Naukri",
"Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
# Desktop Chrome 120 on macOS user agent string.
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

34
jobspy/naukri/util.py Normal file
View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from bs4 import BeautifulSoup
from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type
def parse_job_type(soup: BeautifulSoup | str) -> list[JobType] | None:
    """
    Gets the job type from the job page

    :param soup: parsed job page, or raw markup text which is parsed here
    :return: a single-element list holding the job type read from the
        ``span.job-type`` tag, or None when the tag is absent, empty, or its
        text is not resolved to a job type
    """
    # A plain string would dispatch to str.find(), which rejects the
    # ``class_`` keyword with a TypeError, so parse text input first.
    if isinstance(soup, str):
        soup = BeautifulSoup(soup, "html.parser")
    job_type_tag = soup.find("span", class_="job-type")
    if not job_type_tag:
        return None
    job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
    if not job_type_str:
        return None
    job_type = get_enum_from_job_type(job_type_str)
    # Never hand back [None] if the helper finds no matching job type.
    return [job_type] if job_type is not None else None
def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
    """
    Gets the company industry from the job page

    :param soup: parsed job page, or raw markup text which is parsed here
    :return: stripped text of the ``span.industry`` tag, or None when the
        page has no such tag
    """
    # A plain string would dispatch to str.find(), which rejects the
    # ``class_`` keyword with a TypeError, so parse text input first.
    if isinstance(soup, str):
        soup = BeautifulSoup(soup, "html.parser")
    industry_tag = soup.find("span", class_="industry")
    return industry_tag.get_text(strip=True) if industry_tag else None
def is_job_remote(title: str, description: str, location: Location) -> bool:
    """
    Reports whether the title, description, or displayed location mentions
    remote work ("remote", "work from home" or "wfh"), ignoring case
    """
    displayed_location = location.display_location()
    haystack = "{} {} {}".format(title, description, displayed_location).lower()
    for keyword in ("remote", "work from home", "wfh"):
        if keyword in haystack:
            return True
    return False

View File

@@ -344,4 +344,11 @@ desired_order = [
"company_num_employees",
"company_revenue",
"company_description",
# Naukri-specific fields
"skills",
"experience_range",
"company_rating",
"company_reviews_count",
"vacancy_count",
"work_from_home_type",
]

View File

@@ -216,4 +216,4 @@ class ZipRecruiter(Scraper):
Sends a session event to the API with device properties.
"""
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=get_cookie_data)
self.session.post(url, data=get_cookie_data)

View File

@@ -28,4 +28,4 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
return None

File diff suppressed because it is too large Load Diff

236
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,12 +4,12 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
version = "1.1.78"
version = "1.1.79"
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter, Bayt & Naukri"
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md"
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt", "naukri"]
[[tool.poetry.packages]]
include = "jobspy"
@@ -17,7 +17,7 @@ include = "jobspy"
line-length = 88
[tool.poetry.dependencies]
python = "^3.10 || ^3.12"
python = "^3.10"
requests = "^2.31.0"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"

View File

@@ -1,118 +0,0 @@
annotated-types==0.7.0
anyio==4.6.2.post1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
babel==2.16.0
beautifulsoup4==4.12.3
black==24.10.0
bleach==6.1.0
certifi==2024.8.30
cffi==1.17.1
cfgv==3.4.0
charset-normalizer==3.4.0
click==8.1.7
comm==0.2.2
debugpy==1.8.7
decorator==5.1.1
defusedxml==0.7.1
distlib==0.3.9
executing==2.1.0
fastjsonschema==2.20.0
filelock==3.16.1
fqdn==1.5.1
h11==0.14.0
httpcore==1.0.6
httpx==0.27.2
identify==2.6.1
idna==3.10
ipykernel==6.29.5
ipython==8.28.0
ipywidgets==8.1.5
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
markdownify==0.13.1
MarkupSafe==3.0.2
matplotlib-inline==0.1.7
mistune==3.0.2
mypy-extensions==1.0.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
nodeenv==1.9.1
notebook==7.2.2
notebook_shim==0.2.4
numpy==1.26.3
overrides==7.7.0
packaging==24.1
pandas==2.2.3
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
platformdirs==4.3.6
pre_commit==4.0.1
prometheus_client==0.21.0
prompt_toolkit==3.0.48
psutil==6.1.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
Pygments==2.18.0
python-dateutil==2.9.0.post0
-e git+https://github.com/fakebranden/JobSpy@60819a8fcabbd3eaba7741b673023612dc3d3692#egg=python_jobspy
python-json-logger==2.0.7
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.20.0
Send2Trash==1.8.3
setuptools==75.2.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
terminado==0.18.1
tinycss2==1.3.0
tls-client==1.0.1
tornado==6.4.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20241003
typing_extensions==4.12.2
tzdata==2024.2
uri-template==1.3.0
urllib3==2.2.3
virtualenv==20.27.0
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
widgetsnbextension==4.0.13