mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
Compare commits
45 Commits
v1.1.79
...
6d1cc5c592
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6d1cc5c592 | ||
|
|
d57901da66 | ||
|
|
5146f47d5b | ||
|
|
77cc1f8550 | ||
|
|
84b4524c43 | ||
|
|
e6ae23c76f | ||
|
|
0103e11234 | ||
|
|
697ae5c8c9 | ||
|
|
9e0674f7fc | ||
|
|
bbdad3584e | ||
|
|
a045bb442a | ||
|
|
3eb4c122e7 | ||
|
|
74877c5fd8 | ||
|
|
0a475e312f | ||
|
|
e0514d218e | ||
|
|
529aa8a1f4 | ||
|
|
93a21941eb | ||
|
|
8f8b39c6e2 | ||
|
|
cdcd79edfe | ||
|
|
89a40dc3e3 | ||
|
|
6a326b7dd4 | ||
|
|
0a5c5fa9b3 | ||
|
|
e22e4cc092 | ||
|
|
0abe28fae4 | ||
|
|
31d0389dd8 | ||
|
|
fb9ab3a315 | ||
|
|
c34eff610f | ||
|
|
e9160a0b4c | ||
|
|
cd916c7978 | ||
|
|
25c084ca2c | ||
|
|
341deba465 | ||
|
|
5337b3ec7f | ||
|
|
0171ecc4a0 | ||
|
|
e191405c8e | ||
|
|
a2d139cb96 | ||
|
|
9e41e6e9db | ||
|
|
bb7d4c55ed | ||
|
|
58cc1937bb | ||
|
|
60819a8fca | ||
|
|
1c59cd6738 | ||
|
|
eed96e4c04 | ||
|
|
83c64f4bca | ||
|
|
d8ad9da1c0 | ||
|
|
5f5738eaaa | ||
|
|
e1da326317 |
49
.github/workflows/job_scraper_dynamic.yml
vendored
Normal file
49
.github/workflows/job_scraper_dynamic.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
name: JobSpy Scraper Dynamic Workflow

# Manually dispatched; the caller supplies the e-mail address that selects the
# per-user config read by job_scraper_dynamic.py.
on:
  workflow_dispatch:
    inputs:
      user_email:
        description: 'Email of user'
        required: true
        default: 'Branden@autoemployme.onmicrosoft.com'

permissions:
  contents: read
  # NOTE(review): no step in this workflow appears to request an OIDC token;
  # confirm whether id-token: write is still needed.
  id-token: write

jobs:
  scrape_jobs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3

      - name: Set Up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install Dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt

      - name: Sanitize Email + Create Run ID
        id: vars
        env:
          # Untrusted input is handed to the shell as an environment variable
          # instead of being expanded into the script text, so quotes or
          # $(...) inside the e-mail cannot inject commands.
          USER_EMAIL: ${{ github.event.inputs.user_email }}
        run: |
          safe_email=$(printf '%s\n' "$USER_EMAIL" | sed 's/@/_at_/g; s/\./_/g')
          run_id=$(date +%s)
          echo "safe_email=$safe_email" >> "$GITHUB_OUTPUT"
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"

      - name: Run Job Scraper
        env:
          USER_EMAIL: ${{ github.event.inputs.user_email }}
          RUN_ID: ${{ steps.vars.outputs.run_id }}
        run: |
          python job_scraper_dynamic.py "$USER_EMAIL" "$RUN_ID"

      - name: Upload Output Artifact
        uses: actions/upload-artifact@v4
        with:
          # Must match the file name the Python script derives from the same
          # sanitised e-mail and run id.
          name: jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}
          path: outputs/jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}.csv
|
||||
48
.github/workflows/jobspy_scraper.yml
vendored
Normal file
48
.github/workflows/jobspy_scraper.yml
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
name: JobSpy Scraper Workflow

on:
  # Manual trigger only (GitHub UI or Power Automate).
  workflow_dispatch:
  # The six-hourly schedule is deliberately disabled; uncomment to re-enable.
  # schedule:
  #   - cron: '0 */6 * * *'

permissions:
  actions: read
  contents: read
  id-token: write

jobs:
  scrape_jobs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run JobSpy Scraper
        run: python job_scraper_exact_match.py

      # Fail the run early when the scraper produced no CSV, so the upload
      # step never runs against a missing file.
      - name: Debug - Check if jobspy_output.csv exists
        run: |
          if [ -f jobspy_output.csv ]; then
            echo "✅ jobspy_output.csv found, proceeding to upload..."
          else
            echo "❌ ERROR: jobspy_output.csv not found!"
            exit 1
          fi

      - name: Upload JobSpy Output as Artifact
        uses: actions/upload-artifact@v4
        with:
          name: jobspy-results
          path: jobspy_output.csv
|
||||
17
README.md
17
README.md
@@ -4,7 +4,7 @@
|
||||
|
||||
## Features
|
||||
|
||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
|
||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter** & **Bayt** concurrently
|
||||
- Aggregates the job postings in a dataframe
|
||||
- Proxies support to bypass blocking
|
||||
|
||||
@@ -25,7 +25,7 @@ import csv
|
||||
from jobspy import scrape_jobs
|
||||
|
||||
jobs = scrape_jobs(
|
||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
|
||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
|
||||
search_term="software engineer",
|
||||
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||
location="San Francisco, CA",
|
||||
@@ -51,7 +51,6 @@ linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale
|
||||
linkedin Full-Stack Software Engineer Rain New York NY fulltime yearly None None https://www.linkedin.com/jobs/view/3696158877 Rain’s mission is to create the fastest and ea...
|
||||
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
|
||||
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...
|
||||
|
||||
```
|
||||
|
||||
### Parameters for `scrape_jobs()`
|
||||
@@ -221,7 +220,6 @@ JobPost
|
||||
│ ├── country
|
||||
│ ├── city
|
||||
│ ├── state
|
||||
├── is_remote
|
||||
├── description
|
||||
├── job_type: fulltime, parttime, internship, contract
|
||||
├── job_function
|
||||
@@ -231,7 +229,8 @@ JobPost
|
||||
│ ├── currency
|
||||
│ └── salary_source: direct_data, description (parsed from posting)
|
||||
├── date_posted
|
||||
└── emails
|
||||
├── emails
|
||||
└── is_remote
|
||||
|
||||
Linkedin specific
|
||||
└── job_level
|
||||
@@ -246,12 +245,4 @@ Indeed specific
|
||||
├── company_revenue_label
|
||||
├── company_description
|
||||
└── company_logo
|
||||
|
||||
Naukri specific
|
||||
├── skills
|
||||
├── experience_range
|
||||
├── company_rating
|
||||
├── company_reviews_count
|
||||
├── vacancy_count
|
||||
└── work_from_home_type
|
||||
```
|
||||
|
||||
8
configs/config.json
Normal file
8
configs/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"search_terms": ["IT Support", "Help Desk"],
|
||||
"results_wanted": 50,
|
||||
"max_days_old": 7,
|
||||
"target_state": "NY",
|
||||
"user_email": "Branden@autoemployme.onmicrosoft.com"
|
||||
}
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
{"search_terms":["False tester"],"results_wanted":"100\n","max_days_old":"1\n","target_state":"NY","user_email":"Branden@autoemployme.onmicrosoft.com"}
|
||||
116
job_scraper.py
Normal file
116
job_scraper.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import csv
|
||||
import datetime
|
||||
from jobspy.google import Google
|
||||
from jobspy.linkedin import LinkedIn
|
||||
from jobspy.indeed import Indeed
|
||||
from jobspy.ziprecruiter import ZipRecruiter
|
||||
from jobspy.model import ScraperInput
|
||||
|
||||
# Scraper classes to query, keyed by the site name handed to ScraperInput.
sources = dict(
    google=Google,
    linkedin=LinkedIn,
    indeed=Indeed,
    zip_recruiter=ZipRecruiter,
)

# Search settings consumed by the scrape_jobs() call at the bottom of the script.
search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist"]
results_wanted = 200  # forwarded to every scraper as results_wanted
max_days_old = 2  # maximum posting age in days
target_state = "NY"  # state code a job must match (remote jobs are kept too)
|
||||
|
||||
|
||||
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Scrape every configured source for each search term and filter the results.

    A job is kept when it has a ``date_posted`` no more than ``max_days_old``
    days before today AND it is either located in ``target_state`` or flagged
    remote.

    Args:
        search_terms: query strings; each one is sent to every entry of ``sources``.
        results_wanted: forwarded to ``ScraperInput(results_wanted=...)``.
        max_days_old: maximum age of a posting, in days.
        target_state: state code compared with the stripped, upper-cased
            ``job.location.state``.

    Returns:
        A list of dicts, one per kept job, keyed by the CSV column names.
    """
    all_jobs = []
    today = datetime.date.today()
    print("\n🔎 DEBUG: Fetching jobs for search terms:", search_terms)

    for search_term in search_terms:
        for source_name, source_class in sources.items():
            print(f"\n🚀 Scraping {search_term} from {source_name}...")

            # One failing site must not abort the remaining sites and terms.
            try:
                scraper = source_class()
                search_criteria = ScraperInput(
                    site_type=[source_name],
                    search_term=search_term,
                    results_wanted=results_wanted,
                )
                job_response = scraper.scrape(search_criteria)
            except Exception as exc:
                print(f"⚠️ {source_name} error: {exc}")
                continue

            for job in job_response.jobs:
                # Normalize location fields
                location_city = job.location.city.strip() if job.location.city else "Unknown"
                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
                location_country = str(job.location.country) if job.location.country else "Unknown"

                # Debug: show every job that was fetched
                print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")

                # Jobs without a posting date are treated as too old.
                is_recent = bool(job.date_posted) and (today - job.date_posted).days <= max_days_old
                if not is_recent:
                    print(f"⏳ Ignored (Too Old): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                if not (location_state == target_state or job.is_remote):
                    print(f"❌ Ignored (Wrong State): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                print(f"✅ MATCH (In {target_state} or Remote): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")

                all_jobs.append({
                    "Job ID": job.id,
                    "Job Title (Primary)": job.title,
                    "Company Name": job.company_name if job.company_name else "Unknown",
                    "Industry": job.company_industry if job.company_industry else "Not Provided",
                    "Experience Level": job.job_level if job.job_level else "Not Provided",
                    "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                    "Is Remote": job.is_remote,
                    "Currency": job.compensation.currency if job.compensation else "",
                    "Salary Min": job.compensation.min_amount if job.compensation else "",
                    "Salary Max": job.compensation.max_amount if job.compensation else "",
                    "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
                    "Location City": location_city,
                    "Location State": location_state,
                    "Location Country": location_country,
                    "Job URL": job.job_url,
                    # Only the first 500 characters of the description are kept.
                    "Job Description": job.description[:500] if job.description else "No description available",
                    "Job Source": source_name,
                })

    print(f"\n✅ {len(all_jobs)} jobs retrieved in {target_state}")
    return all_jobs
|
||||
|
||||
|
||||
def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
    """Write the job dicts to ``filename`` as a regular CSV file with a header row.

    When ``jobs`` is empty nothing is written and a warning is printed instead.
    """
    if jobs:
        columns = [
            "Job ID", "Job Title (Primary)", "Company Name", "Industry",
            "Experience Level", "Job Type", "Is Remote", "Currency",
            "Salary Min", "Salary Max", "Date Posted", "Location City",
            "Location State", "Location Country", "Job URL", "Job Description",
            "Job Source",
        ]

        # newline="" lets the csv module control line endings itself.
        with open(filename, mode="w", newline="", encoding="utf-8") as out:
            sheet = csv.DictWriter(out, fieldnames=columns)
            sheet.writeheader()
            for record in jobs:
                sheet.writerow(record)

        print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
    else:
        print("⚠️ No jobs found matching criteria.")
|
||||
|
||||
|
||||
# Scrape all configured sources using the module-level search settings.
job_data = scrape_jobs(search_terms, results_wanted, max_days_old, target_state)

# Write whatever matched to the default CSV file.
save_jobs_to_csv(job_data)
|
||||
94
job_scraper_dynamic.py
Normal file
94
job_scraper_dynamic.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import csv, datetime, os, sys, json
|
||||
from jobspy.google import Google
|
||||
from jobspy.linkedin import LinkedIn
|
||||
from jobspy.indeed import Indeed
|
||||
from jobspy.model import ScraperInput
|
||||
|
||||
# Scraper classes to query, keyed by the site name handed to ScraperInput.
sources = dict(
    google=Google,
    linkedin=LinkedIn,
    indeed=Indeed,
)
|
||||
|
||||
def sanitize_email(email):
    """Return ``email`` with every "@" replaced by "_at_" and every "." by "_"."""
    substitutions = str.maketrans({"@": "_at_", ".": "_"})
    return email.translate(substitutions)
|
||||
|
||||
def load_config(email):
    """Load the JSON config stored for ``email``.

    The file is looked up at ``configs/config_<sanitized email>.json``.

    Returns:
        A ``(config, safe_email)`` tuple: the parsed JSON and the sanitized
        e-mail used in the file name.

    Raises:
        FileNotFoundError: if no config file exists at that path.
    """
    safe_email = sanitize_email(email)
    config_path = os.path.join("configs", "config_" + safe_email + ".json")

    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as handle:
            settings = json.load(handle)
        return settings, safe_email

    raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
|
||||
|
||||
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Scrape every configured source for each term and keep the matching jobs.

    A job is kept when all of the following hold:
      * it has a ``date_posted`` at most ``max_days_old`` days before today;
      * its state equals ``target_state`` or it is flagged remote;
      * its title contains at least one search term (case-insensitive).

    Args:
        search_terms: query strings; each is sent to every entry of ``sources``.
        results_wanted: result count forwarded to ``ScraperInput``; an int or a
            numeric string (surrounding whitespace is ignored).
        max_days_old: maximum posting age in days; int or numeric string.
        target_state: state code; compared case-insensitively after stripping.

    Returns:
        A list of dicts, one per kept job, keyed by the CSV column names.
    """
    def _as_int(value):
        # Values come from a JSON config and may be strings such as "100\n".
        return int(value.strip()) if isinstance(value, str) else int(value)

    results_wanted = _as_int(results_wanted)
    max_days_old = _as_int(max_days_old)
    target_state = str(target_state).strip().upper()
    search_terms = list(search_terms)
    lowered_terms = [candidate.lower() for candidate in search_terms]

    today = datetime.date.today()
    all_jobs = []

    for term in search_terms:
        for source, Scraper in sources.items():
            print(f"🔍 Scraping {term} from {source}")
            scraper = Scraper()
            # Best effort: a failing site is reported and skipped.
            try:
                jobs = scraper.scrape(ScraperInput(
                    site_type=[source],
                    search_term=term,
                    results_wanted=results_wanted
                )).jobs
            except Exception as e:
                print(f"⚠️ {source} error: {e}")
                continue

            for job in jobs:
                # Jobs without a posting date are dropped.
                if not job.date_posted or (today - job.date_posted).days > max_days_old:
                    continue

                job_state = (job.location.state or "").strip().upper()
                if job_state != target_state and not job.is_remote:
                    continue

                title = job.title.lower()
                if not any(wanted in title for wanted in lowered_terms):
                    continue

                all_jobs.append({
                    "Job ID": job.id,
                    "Job Title (Primary)": job.title,
                    "Company Name": job.company_name or "Unknown",
                    "Industry": job.company_industry or "Not Provided",
                    "Experience Level": job.job_level or "Not Provided",
                    "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                    "Is Remote": job.is_remote,
                    "Currency": job.compensation.currency if job.compensation else "",
                    "Salary Min": job.compensation.min_amount if job.compensation else "",
                    "Salary Max": job.compensation.max_amount if job.compensation else "",
                    "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
                    "Location City": job.location.city or "Unknown",
                    "Location State": (job.location.state or "Unknown").upper(),
                    "Location Country": job.location.country or "Unknown",
                    "Job URL": job.job_url,
                    # Commas are stripped from the description text.
                    "Job Description": job.description.replace(",", "") if job.description else "No description",
                    "Job Source": source
                })

    print(f"✅ Found {len(all_jobs)} jobs")
    return all_jobs
|
||||
|
||||
def save_to_csv(jobs, path):
    """Write ``jobs`` to ``path`` in the custom delimited format.

    Format: fields inside a record are joined with ``|~|``; records (the header
    first) are joined with a single comma and no newline. Commas are removed
    from every field value. A key missing from a job dict is written as
    ``Not Provided``.

    Args:
        jobs: list of dicts keyed by the column names below.
        path: output file path; its parent directory is created if needed.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fieldnames = [
        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
        "Experience Level", "Job Type", "Is Remote", "Currency",
        "Salary Min", "Salary Max", "Date Posted", "Location City",
        "Location State", "Location Country", "Job URL", "Job Description", "Job Source"
    ]

    rows = ["|~|".join(fieldnames)]
    for job in jobs:
        cells = (str(job.get(col, "Not Provided")).replace(",", "").strip() for col in fieldnames)
        rows.append("|~|".join(cells))

    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(rows))
    print(f"💾 Saved output to: {path}")
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: <user_email> <run_id>. Any failure is printed and
    # turned into exit status 1.
    try:
        arg_count = len(sys.argv)
        if arg_count != 3:
            raise ValueError("❌ Usage: python job_scraper_dynamic.py <user_email> <run_id>")

        user_email = sys.argv[1]
        run_id = sys.argv[2]

        config, safe_email = load_config(user_email)
        jobs = scrape_jobs(
            config["search_terms"],
            config["results_wanted"],
            config["max_days_old"],
            config["target_state"],
        )

        output_path = f"outputs/jobspy_output_{safe_email}_{run_id}.csv"
        save_to_csv(jobs, output_path)

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        sys.exit(1)
|
||||
146
job_scraper_exact_match.py
Normal file
146
job_scraper_exact_match.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import csv
|
||||
import datetime
|
||||
import os
|
||||
from jobspy.google import Google
|
||||
from jobspy.linkedin import LinkedIn
|
||||
from jobspy.indeed import Indeed
|
||||
from jobspy.model import ScraperInput
|
||||
|
||||
# Scraper classes to query, keyed by the site name handed to ScraperInput.
sources = dict(
    google=Google,
    linkedin=LinkedIn,
    indeed=Indeed,
)

# Search settings consumed by the scrape_jobs() call at the bottom of the script.
search_terms = [
    "Automation Engineer",
    "CRM Manager",
    "Implementation Specialist",
    "CRM",
    "Project Manager",
    "POS",
    "Microsoft Power",
    "IT Support",
]
results_wanted = 100  # forwarded to every scraper as results_wanted
max_days_old = 2  # maximum posting age in days
target_state = "NY"  # state code a job must match (remote jobs are kept too)
|
||||
|
||||
def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    """Scrape every configured source for each term and keep title-matching jobs.

    A job is kept when its title contains at least one search term
    (case-insensitive), it has a ``date_posted`` at most ``max_days_old`` days
    before today, and it is either in ``target_state`` or flagged remote.

    Args:
        search_terms: query strings; each is sent to every entry of ``sources``
            and also used for the title filter.
        results_wanted: forwarded to ``ScraperInput(results_wanted=...)``.
        max_days_old: maximum posting age in days.
        target_state: state code compared with the stripped, upper-cased
            ``job.location.state``.

    Returns:
        A list of dicts, one per kept job, keyed by the CSV column names.
    """
    all_jobs = []
    today = datetime.date.today()

    print("\n🔎 DEBUG: Fetching jobs for search terms:", search_terms)

    for search_term in search_terms:
        for source_name, source_class in sources.items():
            print(f"\n🚀 Scraping {search_term} from {source_name}...")

            # One failing site must not abort the remaining sites and terms.
            try:
                scraper = source_class()
                search_criteria = ScraperInput(
                    site_type=[source_name],
                    search_term=search_term,
                    results_wanted=results_wanted,
                )
                job_response = scraper.scrape(search_criteria)
            except Exception as exc:
                print(f"⚠️ {source_name} error: {exc}")
                continue

            for job in job_response.jobs:
                # Normalize location fields
                location_city = job.location.city.strip() if job.location.city else "Unknown"
                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
                location_country = str(job.location.country) if job.location.country else "Unknown"

                # Debug: show every job that was fetched
                print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")

                # Drop jobs whose title contains none of the search terms.
                if not any(term.lower() in job.title.lower() for term in search_terms):
                    print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
                    continue

                # Jobs without a posting date are treated as too old.
                is_recent = bool(job.date_posted) and (today - job.date_posted).days <= max_days_old
                if not is_recent:
                    print(f"⏳ Ignored (Too Old): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                # Only accept jobs in the target state or remote ones.
                if not (location_state == target_state or job.is_remote):
                    print(f"❌ Ignored (Wrong State): {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                    continue

                print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                all_jobs.append({
                    "Job ID": job.id,
                    "Job Title (Primary)": job.title,
                    "Company Name": job.company_name if job.company_name else "Unknown",
                    "Industry": job.company_industry if job.company_industry else "Not Provided",
                    "Experience Level": job.job_level if job.job_level else "Not Provided",
                    "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                    "Is Remote": job.is_remote,
                    "Currency": job.compensation.currency if job.compensation else "",
                    "Salary Min": job.compensation.min_amount if job.compensation else "",
                    "Salary Max": job.compensation.max_amount if job.compensation else "",
                    "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
                    "Location City": location_city,
                    "Location State": location_state,
                    "Location Country": location_country,
                    "Job URL": job.job_url,
                    # Commas are stripped from the description text.
                    "Job Description": job.description.replace(",", "") if job.description else "No description available",
                    "Job Source": source_name,
                })

    print(f"\n✅ {len(all_jobs)} jobs retrieved in {target_state}")
    return all_jobs
|
||||
|
||||
|
||||
def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
    """Write ``jobs`` to ``filename`` in the custom delimited format.

    - Fields inside a record are joined with ``|~|``.
    - Records (header first) are joined with a single comma.
    - Every comma inside a field value is removed.
    - A field that is blank after stripping becomes 'Not Provided'.

    When ``jobs`` is empty nothing is written and a warning is printed.
    """
    if not jobs:
        print("⚠️ No jobs found matching criteria.")
        return

    # Start from a clean slate: delete any previous output file.
    if os.path.exists(filename):
        os.remove(filename)

    columns = [
        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
        "Experience Level", "Job Type", "Is Remote", "Currency",
        "Salary Min", "Salary Max", "Date Posted", "Location City",
        "Location State", "Location Country", "Job URL", "Job Description",
        "Job Source",
    ]

    def render(entry):
        # Blank -> placeholder first, then strip commas (same order as before).
        cells = [
            (str(entry.get(name, "")).strip() or "Not Provided").replace(",", "")
            for name in columns
        ]
        return "|~|".join(cells)

    lines = ["|~|".join(columns)]
    lines.extend(render(entry) for entry in jobs)

    with open(filename, "w", encoding="utf-8") as sink:
        sink.write(",".join(lines))

    print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
|
||||
|
||||
|
||||
# Scrape all configured sources using the module-level search settings.
job_data = scrape_jobs(search_terms, results_wanted, max_days_old, target_state)

# Write the matches to the default output file in the custom |~| format.
save_jobs_to_csv(job_data)
|
||||
@@ -10,7 +10,6 @@ from jobspy.glassdoor import Glassdoor
|
||||
from jobspy.google import Google
|
||||
from jobspy.indeed import Indeed
|
||||
from jobspy.linkedin import LinkedIn
|
||||
from jobspy.naukri import Naukri
|
||||
from jobspy.model import JobType, Location, JobResponse, Country
|
||||
from jobspy.model import SalarySource, ScraperInput, Site
|
||||
from jobspy.util import (
|
||||
@@ -58,7 +57,6 @@ def scrape_jobs(
|
||||
Site.GLASSDOOR: Glassdoor,
|
||||
Site.GOOGLE: Google,
|
||||
Site.BAYT: BaytScraper,
|
||||
Site.NAUKRI: Naukri,
|
||||
}
|
||||
set_logger_level(verbose)
|
||||
job_type = get_enum_from_value(job_type) if job_type else None
|
||||
@@ -141,7 +139,6 @@ def scrape_jobs(
|
||||
**job_data["location"]
|
||||
).display_location()
|
||||
|
||||
# Handle compensation
|
||||
compensation_obj = job_data.get("compensation")
|
||||
if compensation_obj and isinstance(compensation_obj, dict):
|
||||
job_data["interval"] = (
|
||||
@@ -160,6 +157,7 @@ def scrape_jobs(
|
||||
and job_data["max_amount"]
|
||||
):
|
||||
convert_to_annual(job_data)
|
||||
|
||||
else:
|
||||
if country_enum == Country.USA:
|
||||
(
|
||||
@@ -178,17 +176,6 @@ def scrape_jobs(
|
||||
if "min_amount" in job_data and job_data["min_amount"]
|
||||
else None
|
||||
)
|
||||
|
||||
#naukri-specific fields
|
||||
job_data["skills"] = (
|
||||
", ".join(job_data["skills"]) if job_data["skills"] else None
|
||||
)
|
||||
job_data["experience_range"] = job_data.get("experience_range")
|
||||
job_data["company_rating"] = job_data.get("company_rating")
|
||||
job_data["company_reviews_count"] = job_data.get("company_reviews_count")
|
||||
job_data["vacancy_count"] = job_data.get("vacancy_count")
|
||||
job_data["work_from_home_type"] = job_data.get("work_from_home_type")
|
||||
|
||||
job_df = pd.DataFrame([job_data])
|
||||
jobs_dfs.append(job_df)
|
||||
|
||||
@@ -212,4 +199,4 @@ def scrape_jobs(
|
||||
by=["site", "date_posted"], ascending=[True, False]
|
||||
).reset_index(drop=True)
|
||||
else:
|
||||
return pd.DataFrame()
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -34,7 +34,3 @@ class GoogleJobsException(Exception):
|
||||
class BaytException(Exception):
|
||||
def __init__(self, message=None):
|
||||
super().__init__(message or "An error occurred with Bayt")
|
||||
|
||||
class NaukriException(Exception):
|
||||
def __init__(self,message=None):
|
||||
super().__init__(message or "An error occurred with Naukri")
|
||||
@@ -205,6 +205,8 @@ class Indeed(Scraper):
|
||||
description = job["description"]["html"]
|
||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
description = markdown_converter(description)
|
||||
description = description.replace(",", "")
|
||||
|
||||
|
||||
job_type = get_job_type(job["attributes"])
|
||||
timestamp_seconds = job["datePublished"] / 1000
|
||||
|
||||
@@ -20,7 +20,7 @@ def get_job_type(attributes: list) -> list[JobType]:
|
||||
def get_compensation(compensation: dict) -> Compensation | None:
|
||||
"""
|
||||
Parses the job to get compensation
|
||||
:param compensation:
|
||||
:param compensation:
|
||||
:return: compensation object
|
||||
"""
|
||||
if not compensation["baseSalary"] and not compensation["estimated"]:
|
||||
@@ -58,14 +58,11 @@ def is_job_remote(job: dict, description: str) -> bool:
|
||||
any(keyword in attr["label"].lower() for keyword in remote_keywords)
|
||||
for attr in job["attributes"]
|
||||
)
|
||||
is_remote_in_description = any(
|
||||
keyword in description.lower() for keyword in remote_keywords
|
||||
)
|
||||
is_remote_in_location = any(
|
||||
keyword in job["location"]["formatted"]["long"].lower()
|
||||
for keyword in remote_keywords
|
||||
)
|
||||
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
||||
return is_remote_in_attributes or is_remote_in_location
|
||||
|
||||
|
||||
def get_compensation_interval(interval: str) -> CompensationInterval:
|
||||
|
||||
@@ -14,11 +14,10 @@ from bs4.element import Tag
|
||||
from jobspy.exception import LinkedInException
|
||||
from jobspy.linkedin.constant import headers
|
||||
from jobspy.linkedin.util import (
|
||||
is_job_remote,
|
||||
job_type_code,
|
||||
parse_job_type,
|
||||
parse_job_level,
|
||||
parse_company_industry
|
||||
parse_company_industry,
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
@@ -174,7 +173,7 @@ class LinkedIn(Scraper):
|
||||
) -> Optional[JobPost]:
|
||||
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
||||
|
||||
compensation = description = None
|
||||
compensation = None
|
||||
if salary_tag:
|
||||
salary_text = salary_tag.get_text(separator=" ").strip()
|
||||
salary_values = [currency_parser(value) for value in salary_text.split("-")]
|
||||
@@ -218,8 +217,8 @@ class LinkedIn(Scraper):
|
||||
job_details = {}
|
||||
if full_descr:
|
||||
job_details = self._get_job_details(job_id)
|
||||
description = job_details.get("description")
|
||||
is_remote = is_job_remote(title, description, location)
|
||||
description = description.replace(",", "")
|
||||
|
||||
|
||||
return JobPost(
|
||||
id=f"li-{job_id}",
|
||||
@@ -227,7 +226,6 @@ class LinkedIn(Scraper):
|
||||
company_name=company,
|
||||
company_url=company_url,
|
||||
location=location,
|
||||
is_remote=is_remote,
|
||||
date_posted=date_posted,
|
||||
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||
compensation=compensation,
|
||||
@@ -236,7 +234,7 @@ class LinkedIn(Scraper):
|
||||
company_industry=job_details.get("company_industry"),
|
||||
description=job_details.get("description"),
|
||||
job_url_direct=job_details.get("job_url_direct"),
|
||||
emails=extract_emails_from_text(description),
|
||||
emails=extract_emails_from_text(job_details.get("description")),
|
||||
company_logo=job_details.get("company_logo"),
|
||||
job_function=job_details.get("job_function"),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from jobspy.model import JobType, Location
|
||||
from jobspy.model import JobType
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
@@ -83,14 +83,3 @@ def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
|
||||
industry = industry_span.get_text(strip=True)
|
||||
|
||||
return industry
|
||||
|
||||
|
||||
def is_job_remote(title: dict, description: str, location: Location) -> bool:
|
||||
"""
|
||||
Searches the title, location, and description to check if job is remote
|
||||
"""
|
||||
remote_keywords = ["remote", "work from home", "wfh"]
|
||||
location = location.display_location()
|
||||
full_string = f'{title} {description} {location}'.lower()
|
||||
is_remote = any(keyword in full_string for keyword in remote_keywords)
|
||||
return is_remote
|
||||
|
||||
@@ -254,13 +254,13 @@ class JobPost(BaseModel):
|
||||
is_remote: bool | None = None
|
||||
listing_type: str | None = None
|
||||
|
||||
# LinkedIn specific
|
||||
# linkedin specific
|
||||
job_level: str | None = None
|
||||
|
||||
# LinkedIn and Indeed specific
|
||||
# linkedin and indeed specific
|
||||
company_industry: str | None = None
|
||||
|
||||
# Indeed specific
|
||||
# indeed specific
|
||||
company_addresses: str | None = None
|
||||
company_num_employees: str | None = None
|
||||
company_revenue: str | None = None
|
||||
@@ -268,16 +268,9 @@ class JobPost(BaseModel):
|
||||
company_logo: str | None = None
|
||||
banner_photo_url: str | None = None
|
||||
|
||||
# LinkedIn only atm
|
||||
# linkedin only atm
|
||||
job_function: str | None = None
|
||||
|
||||
# Naukri specific
|
||||
skills: list[str] | None = None #from tagsAndSkills
|
||||
experience_range: str | None = None #from experienceText
|
||||
company_rating: float | None = None #from ambitionBoxData.AggregateRating
|
||||
company_reviews_count: int | None = None #from ambitionBoxData.ReviewsCount
|
||||
vacancy_count: int | None = None #from vacancy
|
||||
work_from_home_type: str | None = None #from clusters.wfhType (e.g., "Hybrid", "Remote")
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
jobs: list[JobPost] = []
|
||||
@@ -290,7 +283,6 @@ class Site(Enum):
|
||||
GLASSDOOR = "glassdoor"
|
||||
GOOGLE = "google"
|
||||
BAYT = "bayt"
|
||||
NAUKRI = "naukri"
|
||||
|
||||
|
||||
class SalarySource(Enum):
|
||||
|
||||
@@ -1,301 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
from datetime import datetime, date
|
||||
from typing import Optional
|
||||
|
||||
import regex as re
|
||||
import requests
|
||||
|
||||
from jobspy.exception import NaukriException
|
||||
from jobspy.naukri.constant import headers as naukri_headers
|
||||
from jobspy.naukri.util import (
|
||||
is_job_remote,
|
||||
parse_job_type,
|
||||
parse_company_industry,
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
Country,
|
||||
Compensation,
|
||||
DescriptionFormat,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
currency_parser,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
create_logger,
|
||||
)
|
||||
|
||||
log = create_logger("Naukri")
|
||||
|
||||
class Naukri(Scraper):
    """
    Scraper for Naukri.com that pages through the site's JSON search API
    and converts each returned entry into a JobPost.
    """

    base_url = "https://www.naukri.com/jobapi/v3/search"
    # Between pages the scraper sleeps a random time in
    # [delay, delay + band_delay] (values handed to time.sleep).
    delay = 3
    band_delay = 4
    jobs_per_page = 20
    # Hard stop on the page number so a search can never loop unbounded.
    max_pages = 50

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes the Naukri scraper and its HTTP session.

        :param proxies: proxy or list of proxies forwarded to the session
        :param ca_cert: CA certificate path forwarded to the session
        """
        super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(naukri_headers)
        self.scraper_input = None
        self.country = "India"  # naukri is india-focused by default
        log.info("Naukri scraper initialized")

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes the Naukri API for jobs matching scraper_input.

        Stops when enough jobs were collected, when the API returns no more
        entries, when `max_pages` is exceeded, or on the first failed request
        (in which case the jobs collected so far are returned).

        :param scraper_input: search criteria
        :return: job_response
        :raises NaukriException: if converting an individual job entry fails
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        start = scraper_input.offset or 0
        page = (start // self.jobs_per_page) + 1
        request_count = 0
        fetch_desc = scraper_input.linkedin_fetch_description
        total_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)

        # The API filters by whole days. Round sub-day windows up to one day
        # instead of sending days=0.
        days_old = (
            max(1, scraper_input.hours_old // 24) if scraper_input.hours_old else None
        )

        # A missing search term must not crash on .lower(); a None seoKey is
        # dropped from the query together with the other None values below.
        search_term = scraper_input.search_term
        seo_key = (
            f"{search_term.lower().replace(' ', '-')}-jobs" if search_term else None
        )

        def continue_search() -> bool:
            return (
                len(job_list) < scraper_input.results_wanted
                and page <= self.max_pages
            )

        while continue_search():
            request_count += 1
            log.info(
                f"Scraping page {request_count} / {total_pages} "
                f"for search term: {scraper_input.search_term}"
            )
            params = {
                "noOfResults": self.jobs_per_page,
                "urlType": "search_by_keyword",
                "searchType": "adv",
                "keyword": search_term,
                "pageNo": page,
                "k": search_term,
                "seoKey": seo_key,
                "src": "jobsearchDesk",
                "latLong": "",
                "location": scraper_input.location,
                "remote": "true" if scraper_input.is_remote else None,
            }
            if days_old:
                params["days"] = days_old

            # Unset criteria are omitted from the query string entirely.
            params = {k: v for k, v in params.items() if v is not None}
            try:
                log.debug(f"Sending request to {self.base_url} with params: {params}")
                response = self.session.get(self.base_url, params=params, timeout=10)
                if response.status_code not in range(200, 400):
                    err = f"Naukri API response status code {response.status_code} - {response.text}"
                    log.error(err)
                    return JobResponse(jobs=job_list)
                data = response.json()
                job_details = data.get("jobDetails", [])
                log.info(f"Received {len(job_details)} job entries from API")
                if not job_details:
                    log.warning("No job details found in API response")
                    break
            except Exception as e:
                log.error(f"Naukri API request failed: {str(e)}")
                return JobResponse(jobs=job_list)

            for job in job_details:
                job_id = job.get("jobId")
                if not job_id or job_id in seen_ids:
                    continue
                seen_ids.add(job_id)
                log.debug(f"Processing job ID: {job_id}")

                try:
                    job_post = self._process_job(job, job_id, fetch_desc)
                    if job_post:
                        job_list.append(job_post)
                        log.info(f"Added job: {job_post.title} (ID: {job_id})")
                    if not continue_search():
                        break
                except Exception as e:
                    log.error(f"Error processing job ID {job_id}: {str(e)}")
                    # Chain the original exception so the traceback is kept.
                    raise NaukriException(str(e)) from e

            if continue_search():
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
                page += 1

        job_list = job_list[: scraper_input.results_wanted]
        log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
        return JobResponse(jobs=job_list)

    def _process_job(
        self, job: dict, job_id: str, full_descr: bool
    ) -> Optional[JobPost]:
        """
        Processes a single job from the API response into a JobPost object.

        :param job: one entry of the response's "jobDetails" list
        :param job_id: the entry's "jobId"
        :param full_descr: when False the description (and everything derived
            from it) is left unset
        :return: the populated JobPost
        """
        title = job.get("title", "N/A")
        company = job.get("companyName", "N/A")
        company_url = (
            f"https://www.naukri.com/{job.get('staticUrl', '')}"
            if job.get("staticUrl")
            else None
        )

        # `or []` also covers an explicit null in the payload.
        placeholders = job.get("placeholders") or []
        location = self._get_location(placeholders)
        compensation = self._get_compensation(placeholders)
        date_posted = self._parse_date(
            job.get("footerPlaceholderLabel"), job.get("createdDate")
        )

        # Fall back to the generic job path when jdURL is missing *or* null.
        job_path = job.get("jdURL") or f"/job/{job_id}"
        job_url = f"https://www.naukri.com{job_path}"
        description = job.get("jobDescription") if full_descr else None
        if (
            description
            and self.scraper_input.description_format == DescriptionFormat.MARKDOWN
        ):
            description = markdown_converter(description)

        # NOTE(review): jobspy.naukri.util annotates these helpers as taking a
        # BeautifulSoup object, but a plain string is passed here - confirm.
        job_type = parse_job_type(description) if description else None
        company_industry = parse_company_industry(description) if description else None
        is_remote = is_job_remote(title, description or "", location)
        company_logo = job.get("logoPathV3") or job.get("logoPath")

        # Naukri-specific fields
        tags = job.get("tagsAndSkills")
        skills = (
            [tag.strip() for tag in tags.split(",") if tag.strip()] or None
            if tags
            else None
        )
        experience_range = job.get("experienceText")
        ambition_box = job.get("ambitionBoxData") or {}
        rating_raw = ambition_box.get("AggregateRating")
        try:
            company_rating = float(rating_raw) if rating_raw else None
        except (TypeError, ValueError):
            # A non-numeric rating should not discard the whole job.
            company_rating = None
        company_reviews_count = ambition_box.get("ReviewsCount")
        vacancy_count = job.get("vacancy")
        work_from_home_type = self._infer_work_from_home_type(
            placeholders, title, description or ""
        )

        job_post = JobPost(
            id=f"nk-{job_id}",
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            is_remote=is_remote,
            date_posted=date_posted,
            job_url=job_url,
            compensation=compensation,
            job_type=job_type,
            company_industry=company_industry,
            description=description,
            emails=extract_emails_from_text(description or ""),
            company_logo=company_logo,
            skills=skills,
            experience_range=experience_range,
            company_rating=company_rating,
            company_reviews_count=company_reviews_count,
            vacancy_count=vacancy_count,
            work_from_home_type=work_from_home_type,
        )
        log.debug(f"Processed job: {title} at {company}")
        return job_post

    def _get_location(self, placeholders: list[dict]) -> Location:
        """
        Extracts location data from placeholders.

        Uses the first placeholder whose "type" is "location"; its label is
        split on ", " into city and (optional) state. Without such a
        placeholder a Location carrying only the country is returned.
        """
        location = Location(country=Country.INDIA)
        for placeholder in placeholders:
            if placeholder.get("type") != "location":
                continue
            location_str = placeholder.get("label") or ""
            parts = location_str.split(", ")
            # str.split never returns an empty list, so test the value itself
            # to avoid reporting an empty-string city.
            city = parts[0] or None
            state = parts[1] if len(parts) > 1 else None
            location = Location(city=city, state=state, country=Country.INDIA)
            log.debug(f"Parsed location: {location.display_location()}")
            break
        return location

    def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
        """
        Extracts compensation data from placeholders, handling Indian salary
        formats (Lakhs, Crores).

        Only the first placeholder of type "salary" is considered.

        :return: Compensation in INR, or None when the salary is missing,
            not disclosed, or not a "<min>-<max> <unit>" range
        """
        unit_multipliers = {
            "lacs": 100000,  # 1 Lakh = 100,000 INR
            "lakh": 100000,
            "cr": 10000000,  # 1 Crore = 10,000,000 INR
        }
        for placeholder in placeholders:
            if placeholder.get("type") != "salary":
                continue
            salary_text = (placeholder.get("label") or "").strip()
            if salary_text == "Not disclosed":
                log.debug("Salary not disclosed")
                return None

            # Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
            salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
            if not salary_match:
                log.debug(f"Could not parse salary: {salary_text}")
                return None

            min_salary, max_salary, unit = salary_match.groups()[:3]
            multiplier = unit_multipliers[unit.lower()]
            min_salary = float(min_salary) * multiplier
            max_salary = float(max_salary) * multiplier
            currency = "INR"

            log.debug(f"Parsed salary: {min_salary} - {max_salary} INR")
            # round() before int(): float products such as 2.3 * 100000 land
            # just below the intended integer and would be truncated.
            return Compensation(
                min_amount=int(round(min_salary)),
                max_amount=int(round(max_salary)),
                currency=currency,
            )
        return None

    def _parse_date(self, label: str, created_date: int) -> Optional[date]:
        """
        Parses the posting date from footerPlaceholderLabel or createdDate.

        :param label: relative label such as "Today" or "3 days ago" (may be empty)
        :param created_date: timestamp that is divided by 1000 before
            conversion (i.e. treated as epoch milliseconds); used whenever
            the label cannot be interpreted
        :return: a date object, or None when neither source is usable
        """
        # Local import: the module header only imports datetime and date.
        from datetime import timedelta

        today = datetime.now()
        if label:
            label = label.lower()
            if "today" in label or "just now" in label or "few hours" in label:
                log.debug("Date parsed as today")
                return today.date()
            if "ago" in label:
                match = re.search(r"(\d+)\+?\s*day", label)
                if match:
                    days = int(match.group(1))
                    # timedelta handles month/year boundaries; replacing the
                    # day-of-month raised ValueError once days >= today.day.
                    parsed_date = (today - timedelta(days=days)).date()
                    log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
                    return parsed_date

        # The label was absent or unparseable: fall back to the timestamp.
        if created_date:
            parsed_date = datetime.fromtimestamp(created_date / 1000).date()
            log.debug(f"Date parsed from timestamp: {parsed_date}")
            return parsed_date
        log.debug("No date parsed")
        return None

    def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
        """
        Infers work-from-home type from job data: 'Hybrid', 'Remote' or
        'Work from office'.

        The location placeholder label, the title and the description are
        searched case-insensitively; "hybrid" takes precedence over "remote",
        and anything else is reported as 'Work from office'.
        """
        # .get() keeps this consistent with the other placeholder readers and
        # tolerates entries lacking "type" or "label".
        location_str = next(
            (
                (p.get("label") or "")
                for p in placeholders
                if p.get("type") == "location"
            ),
            "",
        ).lower()
        haystacks = (location_str, (title or "").lower(), (description or "").lower())

        if any("hybrid" in text for text in haystacks):
            return "Hybrid"
        if any("remote" in text for text in haystacks):
            return "Remote"
        # Reaching this point means neither keyword occurs in the description,
        # so the original "work from office or neither keyword" test is always
        # true here.
        return "Work from office"
|
||||
@@ -1,11 +0,0 @@
|
||||
# HTTP request headers used for calls to naukri.com.
# NOTE(review): "Nkparam" is a hard-coded opaque token - confirm whether the
# API requires it and whether it can expire.
headers = {
    "authority": "www.naukri.com",
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "appid": "109",
    "systemid": "Naukri",
    "Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}
|
||||
@@ -1,34 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from jobspy.model import JobType, Location
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
def parse_job_type(soup: BeautifulSoup | str) -> list[JobType] | None:
    """
    Gets the job type from the job page.

    :param soup: parsed page, or raw markup/text which is parsed here.
        Accepting a string matters because str.find() rejects the
        ``class_`` keyword and would raise TypeError.
    :return: single-element list with the matched JobType, or None when the
        page has no ``<span class="job-type">`` or its text maps to no type
    """
    if isinstance(soup, str):
        soup = BeautifulSoup(soup, "html.parser")

    job_type_tag = soup.find("span", class_="job-type")
    if job_type_tag is None:
        return None

    job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
    if not job_type_str:
        return None

    # Guard against an unmatched lookup so callers never receive [None].
    # NOTE(review): assumes get_enum_from_job_type returns None on no match - confirm.
    job_type = get_enum_from_job_type(job_type_str)
    return [job_type] if job_type else None
|
||||
|
||||
|
||||
def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
    """
    Gets the company industry from the job page.

    :param soup: parsed page, or raw markup/text which is parsed here.
        Accepting a string matters because str.find() rejects the
        ``class_`` keyword and would raise TypeError.
    :return: text of the first ``<span class="industry">``, or None if absent
    """
    if isinstance(soup, str):
        soup = BeautifulSoup(soup, "html.parser")

    industry_tag = soup.find("span", class_="industry")
    return industry_tag.get_text(strip=True) if industry_tag else None
|
||||
|
||||
|
||||
def is_job_remote(title: str, description: str, location: Location) -> bool:
    """
    Reports whether a job looks remote.

    The title, description and the location's display string are joined and
    searched case-insensitively for "remote", "work from home" or "wfh".
    """
    where = location.display_location()
    haystack = "{} {} {}".format(title, description, where).lower()
    for marker in ("remote", "work from home", "wfh"):
        if marker in haystack:
            return True
    return False
|
||||
@@ -344,11 +344,4 @@ desired_order = [
|
||||
"company_num_employees",
|
||||
"company_revenue",
|
||||
"company_description",
|
||||
#naukri-specific fields
|
||||
"skills",
|
||||
"experience_range",
|
||||
"company_rating",
|
||||
"company_reviews_count",
|
||||
"vacancy_count",
|
||||
"work_from_home_type",
|
||||
]
|
||||
|
||||
@@ -216,4 +216,4 @@ class ZipRecruiter(Scraper):
|
||||
Sends a session event to the API with device properties.
|
||||
"""
|
||||
url = f"{self.api_url}/jobs-app/event"
|
||||
self.session.post(url, data=get_cookie_data)
|
||||
self.session.post(url, data=get_cookie_data)
|
||||
@@ -28,4 +28,4 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
return None
|
||||
return None
|
||||
1159
jobspy_output.csv
Normal file
1159
jobspy_output.csv
Normal file
File diff suppressed because it is too large
Load Diff
236
poetry.lock
generated
236
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,12 +4,12 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.79"
|
||||
version = "1.1.78"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
|
||||
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||
readme = "README.md"
|
||||
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt", "naukri"]
|
||||
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
|
||||
[[tool.poetry.packages]]
|
||||
include = "jobspy"
|
||||
|
||||
@@ -17,7 +17,7 @@ include = "jobspy"
|
||||
line-length = 88
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
python = "^3.10 || ^3.12"
|
||||
requests = "^2.31.0"
|
||||
beautifulsoup4 = "^4.12.2"
|
||||
pandas = "^2.1.0"
|
||||
|
||||
118
requirements.txt
Normal file
118
requirements.txt
Normal file
@@ -0,0 +1,118 @@
|
||||
annotated-types==0.7.0
|
||||
anyio==4.6.2.post1
|
||||
argon2-cffi==23.1.0
|
||||
argon2-cffi-bindings==21.2.0
|
||||
arrow==1.3.0
|
||||
asttokens==2.4.1
|
||||
async-lru==2.0.4
|
||||
attrs==24.2.0
|
||||
babel==2.16.0
|
||||
beautifulsoup4==4.12.3
|
||||
black==24.10.0
|
||||
bleach==6.1.0
|
||||
certifi==2024.8.30
|
||||
cffi==1.17.1
|
||||
cfgv==3.4.0
|
||||
charset-normalizer==3.4.0
|
||||
click==8.1.7
|
||||
comm==0.2.2
|
||||
debugpy==1.8.7
|
||||
decorator==5.1.1
|
||||
defusedxml==0.7.1
|
||||
distlib==0.3.9
|
||||
executing==2.1.0
|
||||
fastjsonschema==2.20.0
|
||||
filelock==3.16.1
|
||||
fqdn==1.5.1
|
||||
h11==0.14.0
|
||||
httpcore==1.0.6
|
||||
httpx==0.27.2
|
||||
identify==2.6.1
|
||||
idna==3.10
|
||||
ipykernel==6.29.5
|
||||
ipython==8.28.0
|
||||
ipywidgets==8.1.5
|
||||
isoduration==20.11.0
|
||||
jedi==0.19.1
|
||||
Jinja2==3.1.4
|
||||
json5==0.9.25
|
||||
jsonpointer==3.0.0
|
||||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
jupyter==1.1.1
|
||||
jupyter-console==6.6.3
|
||||
jupyter-events==0.10.0
|
||||
jupyter-lsp==2.2.5
|
||||
jupyter_client==8.6.3
|
||||
jupyter_core==5.7.2
|
||||
jupyter_server==2.14.2
|
||||
jupyter_server_terminals==0.5.3
|
||||
jupyterlab==4.2.5
|
||||
jupyterlab_pygments==0.3.0
|
||||
jupyterlab_server==2.27.3
|
||||
jupyterlab_widgets==3.0.13
|
||||
markdownify==0.13.1
|
||||
MarkupSafe==3.0.2
|
||||
matplotlib-inline==0.1.7
|
||||
mistune==3.0.2
|
||||
mypy-extensions==1.0.0
|
||||
nbclient==0.10.0
|
||||
nbconvert==7.16.4
|
||||
nbformat==5.10.4
|
||||
nest-asyncio==1.6.0
|
||||
nodeenv==1.9.1
|
||||
notebook==7.2.2
|
||||
notebook_shim==0.2.4
|
||||
numpy==1.26.3
|
||||
overrides==7.7.0
|
||||
packaging==24.1
|
||||
pandas==2.2.3
|
||||
pandocfilters==1.5.1
|
||||
parso==0.8.4
|
||||
pathspec==0.12.1
|
||||
pexpect==4.9.0
|
||||
platformdirs==4.3.6
|
||||
pre_commit==4.0.1
|
||||
prometheus_client==0.21.0
|
||||
prompt_toolkit==3.0.48
|
||||
psutil==6.1.0
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
pycparser==2.22
|
||||
pydantic==2.9.2
|
||||
pydantic_core==2.23.4
|
||||
Pygments==2.18.0
|
||||
python-dateutil==2.9.0.post0
|
||||
-e git+https://github.com/fakebranden/JobSpy@60819a8fcabbd3eaba7741b673023612dc3d3692#egg=python_jobspy
|
||||
python-json-logger==2.0.7
|
||||
pytz==2024.2
|
||||
PyYAML==6.0.2
|
||||
pyzmq==26.2.0
|
||||
referencing==0.35.1
|
||||
regex==2024.9.11
|
||||
requests==2.32.3
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rpds-py==0.20.0
|
||||
Send2Trash==1.8.3
|
||||
setuptools==75.2.0
|
||||
six==1.16.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.6
|
||||
stack-data==0.6.3
|
||||
terminado==0.18.1
|
||||
tinycss2==1.3.0
|
||||
tls-client==1.0.1
|
||||
tornado==6.4.1
|
||||
traitlets==5.14.3
|
||||
types-python-dateutil==2.9.0.20241003
|
||||
typing_extensions==4.12.2
|
||||
tzdata==2024.2
|
||||
uri-template==1.3.0
|
||||
urllib3==2.2.3
|
||||
virtualenv==20.27.0
|
||||
wcwidth==0.2.13
|
||||
webcolors==24.8.0
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.8.0
|
||||
widgetsnbextension==4.0.13
|
||||
Reference in New Issue
Block a user