mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 03:54:31 -08:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abd5878238 | ||
|
|
ae2b1ea42c | ||
|
|
53b3b41385 | ||
|
|
9aae02453d | ||
|
|
94d413bad1 | ||
|
|
61205bcc77 | ||
|
|
f1602eca70 | ||
|
|
d4d52d05f5 | ||
|
|
0946cb3373 | ||
|
|
051981689f | ||
|
|
903b7e6f1b | ||
|
|
6782b9884e | ||
|
|
94c74d60f2 | ||
|
|
5463e5a664 | ||
|
|
ed139e7e6b |
8
.github/workflows/publish-to-pypi.yml
vendored
8
.github/workflows/publish-to-pypi.yml
vendored
@@ -1,5 +1,9 @@
|
||||
name: Publish JobSpy to PyPi
|
||||
on: push
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build-n-publish:
|
||||
@@ -27,7 +31,7 @@ jobs:
|
||||
build
|
||||
|
||||
- name: Publish distribution 📦 to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
if: startsWith(github.ref, 'refs/tags') || github.event_name == 'workflow_dispatch'
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
23
README.md
23
README.md
@@ -4,7 +4,7 @@
|
||||
|
||||
## Features
|
||||
|
||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
|
||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & other job boards concurrently
|
||||
- Aggregates the job postings in a dataframe
|
||||
- Proxies support to bypass blocking
|
||||
|
||||
@@ -25,7 +25,7 @@ import csv
|
||||
from jobspy import scrape_jobs
|
||||
|
||||
jobs = scrape_jobs(
|
||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
|
||||
site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
|
||||
search_term="software engineer",
|
||||
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||
location="San Francisco, CA",
|
||||
@@ -51,6 +51,7 @@ linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale
|
||||
linkedin Full-Stack Software Engineer Rain New York NY fulltime yearly None None https://www.linkedin.com/jobs/view/3696158877 Rain’s mission is to create the fastest and ea...
|
||||
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
|
||||
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...
|
||||
|
||||
```
|
||||
|
||||
### Parameters for `scrape_jobs()`
|
||||
@@ -58,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
|
||||
```plaintext
|
||||
Optional
|
||||
├── site_name (list|str):
|
||||
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt
|
||||
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt, bdjobs
|
||||
| (default is all)
|
||||
│
|
||||
├── search_term (str)
|
||||
@@ -85,6 +86,10 @@ Optional
|
||||
│
|
||||
├── easy_apply (bool):
|
||||
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
||||
|
|
||||
├── user_agent (str):
|
||||
| override the default user agent which may be outdated
|
||||
|
|
||||
│
|
||||
├── description_format (str):
|
||||
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
||||
@@ -220,6 +225,7 @@ JobPost
|
||||
│ ├── country
|
||||
│ ├── city
|
||||
│ ├── state
|
||||
├── is_remote
|
||||
├── description
|
||||
├── job_type: fulltime, parttime, internship, contract
|
||||
├── job_function
|
||||
@@ -229,8 +235,7 @@ JobPost
|
||||
│ ├── currency
|
||||
│ └── salary_source: direct_data, description (parsed from posting)
|
||||
├── date_posted
|
||||
├── emails
|
||||
└── is_remote
|
||||
└── emails
|
||||
|
||||
Linkedin specific
|
||||
└── job_level
|
||||
@@ -245,4 +250,12 @@ Indeed specific
|
||||
├── company_revenue_label
|
||||
├── company_description
|
||||
└── company_logo
|
||||
|
||||
Naukri specific
|
||||
├── skills
|
||||
├── experience_range
|
||||
├── company_rating
|
||||
├── company_reviews_count
|
||||
├── vacancy_count
|
||||
└── work_from_home_type
|
||||
```
|
||||
|
||||
@@ -6,10 +6,12 @@ from typing import Tuple
|
||||
import pandas as pd
|
||||
|
||||
from jobspy.bayt import BaytScraper
|
||||
from jobspy.bdjobs import BDJobs
|
||||
from jobspy.glassdoor import Glassdoor
|
||||
from jobspy.google import Google
|
||||
from jobspy.indeed import Indeed
|
||||
from jobspy.linkedin import LinkedIn
|
||||
from jobspy.naukri import Naukri
|
||||
from jobspy.model import JobType, Location, JobResponse, Country
|
||||
from jobspy.model import SalarySource, ScraperInput, Site
|
||||
from jobspy.util import (
|
||||
@@ -24,6 +26,8 @@ from jobspy.util import (
|
||||
from jobspy.ziprecruiter import ZipRecruiter
|
||||
|
||||
|
||||
# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function
|
||||
|
||||
def scrape_jobs(
|
||||
site_name: str | list[str] | Site | list[Site] | None = None,
|
||||
search_term: str | None = None,
|
||||
@@ -44,6 +48,7 @@ def scrape_jobs(
|
||||
hours_old: int = None,
|
||||
enforce_annual_salary: bool = False,
|
||||
verbose: int = 0,
|
||||
user_agent: str = None,
|
||||
**kwargs,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
@@ -57,6 +62,8 @@ def scrape_jobs(
|
||||
Site.GLASSDOOR: Glassdoor,
|
||||
Site.GOOGLE: Google,
|
||||
Site.BAYT: BaytScraper,
|
||||
Site.NAUKRI: Naukri,
|
||||
Site.BDJOBS: BDJobs, # Add BDJobs to the scraper mapping
|
||||
}
|
||||
set_logger_level(verbose)
|
||||
job_type = get_enum_from_value(job_type) if job_type else None
|
||||
@@ -96,7 +103,7 @@ def scrape_jobs(
|
||||
|
||||
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
||||
scraper_class = SCRAPER_MAPPING[site]
|
||||
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
|
||||
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
|
||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||
cap_name = site.value.capitalize()
|
||||
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
||||
@@ -139,6 +146,7 @@ def scrape_jobs(
|
||||
**job_data["location"]
|
||||
).display_location()
|
||||
|
||||
# Handle compensation
|
||||
compensation_obj = job_data.get("compensation")
|
||||
if compensation_obj and isinstance(compensation_obj, dict):
|
||||
job_data["interval"] = (
|
||||
@@ -157,7 +165,6 @@ def scrape_jobs(
|
||||
and job_data["max_amount"]
|
||||
):
|
||||
convert_to_annual(job_data)
|
||||
|
||||
else:
|
||||
if country_enum == Country.USA:
|
||||
(
|
||||
@@ -176,6 +183,17 @@ def scrape_jobs(
|
||||
if "min_amount" in job_data and job_data["min_amount"]
|
||||
else None
|
||||
)
|
||||
|
||||
#naukri-specific fields
|
||||
job_data["skills"] = (
|
||||
", ".join(job_data["skills"]) if job_data["skills"] else None
|
||||
)
|
||||
job_data["experience_range"] = job_data.get("experience_range")
|
||||
job_data["company_rating"] = job_data.get("company_rating")
|
||||
job_data["company_reviews_count"] = job_data.get("company_reviews_count")
|
||||
job_data["vacancy_count"] = job_data.get("vacancy_count")
|
||||
job_data["work_from_home_type"] = job_data.get("work_from_home_type")
|
||||
|
||||
job_df = pd.DataFrame([job_data])
|
||||
jobs_dfs.append(job_df)
|
||||
|
||||
@@ -200,3 +218,9 @@ def scrape_jobs(
|
||||
).reset_index(drop=True)
|
||||
else:
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
# Add BDJobs to __all__
|
||||
__all__ = [
|
||||
"BDJobs",
|
||||
]
|
||||
@@ -25,7 +25,7 @@ class BaytScraper(Scraper):
|
||||
band_delay = 3
|
||||
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
|
||||
self.scraper_input = None
|
||||
|
||||
353
jobspy/bdjobs/__init__.py
Normal file
353
jobspy/bdjobs/__init__.py
Normal file
@@ -0,0 +1,353 @@
|
||||
# __init__.py
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
|
||||
from jobspy.exception import BDJobsException
|
||||
from jobspy.bdjobs.constant import headers, search_params
|
||||
from jobspy.bdjobs.util import (
|
||||
parse_location,
|
||||
parse_date,
|
||||
find_job_listings,
|
||||
is_job_remote,
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
Country,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
DescriptionFormat,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
create_logger,
|
||||
remove_attributes,
|
||||
markdown_converter,
|
||||
)
|
||||
|
||||
log = create_logger("BDJobs")
|
||||
|
||||
|
||||
class BDJobs(Scraper):
    """
    Scraper for jobs.bdjobs.com (Bangladesh).

    Fetches paginated search results, turns each job card into a JobPost,
    and fetches each job's detail page for description/job-type fields.
    """

    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    # Base delay (seconds) between result pages; band_delay adds random jitter.
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        # BDJobs only serves Bangladeshi listings; used when parsing locations.
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        # Deduplicate across pages: premium/featured cards can repeat.
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted

        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")

            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards or len(job_cards) == 0:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)

                            if not continue_search():
                                break
                    except Exception as e:
                        # A single bad card should not abort the whole page.
                        log.error(f"Error processing job card: {str(e)}")

                page += 1
                # Add delay between requests (base delay plus random jitter)
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        # Trim any overshoot from the last page.
        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object, or None if the card has no job-detail link
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL; fall back to a hash-derived id when
            # the URL carries no "jobid=" query parameter.
            job_id = (
                job_url.split("jobid=")[-1].split("&")[0]
                if "jobid=" in job_url
                else f"bdjobs-{hash(job_url)}"
            )

            # Extract title (anchor text first, then a title-text element)
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(
                    ["h2", "h3", "h4", "strong", "div"],
                    class_=lambda c: c and "job-title-text" in c,
                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name - IMPROVED
            company_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "comp-name-text" in (c or "").lower(),
            )
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["company", "org", "comp-name"]
                    ),
                )
                company_name = (
                    company_elem.get_text(strip=True) if company_elem else "N/A"
                )

            # Extract location
            location_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "locon-text-d" in (c or "").lower(),
            )
            if not location_elem:
                location_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["location", "area", "locon"]
                    ),
                )
            # Default to the capital when the card carries no location at all.
            location_text = (
                location_elem.get_text(strip=True)
                if location_elem
                else "Dhaka, Bangladesh"
            )

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            # NOTE(review): matching class names like "date"/"deadline" means this
            # may pick up the application deadline rather than the posting date —
            # confirm against live markup.
            date_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c
                and any(
                    term in (c or "").lower()
                    for term in ["date", "deadline", "published"]
                ),
            )
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            # NOTE(review): this assigns a string (possibly "") to job_type;
            # other scrapers store a list of JobType — verify against the model.
            job_post.job_type = job_details.get("job_type", "")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with description, job_type, and company_industry
                 keys; empty dict on any failure
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")

            # Find job description - IMPROVED based on correct.py
            description = ""

            # Try to find the job content div first (as in correct.py)
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find(
                    "h4", id="job_resp"
                ) or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True)
                                for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(
                                sibling.get_text(separator=" ", strip=True)
                            )

                    description = (
                        "\n".join(responsibilities_elements)
                        if responsibilities_elements
                        else ""
                    )

            # If no description found yet, try the original approach
            if not description:
                description_elem = soup.find(
                    ["div", "section"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["job-description", "details", "requirements"]
                    ),
                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if (
                        hasattr(self.scraper_input, "description_format")
                        and self.scraper_input.description_format
                        == DescriptionFormat.MARKDOWN
                    ):
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(
                ["span", "div"],
                string=lambda s: s
                and any(
                    term in (s or "").lower()
                    for term in ["job type", "employment type"]
                ),
            )
            job_type = None
            if job_type_elem:
                # The value is assumed to sit in the next span/div after the label.
                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                job_type = job_type_text if job_type_text else None

            # Extract company industry
            industry_elem = soup.find(
                ["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
            )
            company_industry = None
            if industry_elem:
                industry_text = industry_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }

        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
|
||||
32
jobspy/bdjobs/constant.py
Normal file
32
jobspy/bdjobs/constant.py
Normal file
@@ -0,0 +1,32 @@
|
||||
#constant.py
# Static request/parse configuration for the BDJobs scraper.

# Headers for BDJobs requests (desktop Chrome profile with a same-site Referer)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings, tried in order by find_job_listings()
job_selectors = [
    "div.job-item",  # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",  # Catches featured job listings
]

# Date formats used by BDJobs, tried in order by parse_date()
date_formats = [
    "%d %b %Y",  # e.g. 05 Mar 2026
    "%d-%b-%Y",  # e.g. 05-Mar-2026
    "%d %B %Y",  # e.g. 05 March 2026
    "%B %d, %Y",  # e.g. March 05, 2026
    "%d/%m/%Y",  # e.g. 05/03/2026
]
|
||||
100
jobspy/bdjobs/util.py
Normal file
100
jobspy/bdjobs/util.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#util.py
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
from jobspy.model import Location, Country
|
||||
|
||||
|
||||
def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Convert a raw location string into a Location object.

    :param location_text: Free-form location text, e.g. "Dhaka, Bangladesh"
    :param country: Country name resolved via Country.from_string
    :return: Location with city (and state when a second segment exists)
    """
    country_enum = Country.from_string(country)
    segments = [segment.strip() for segment in location_text.split(",")]

    # A single segment means no state information is available.
    if len(segments) < 2:
        return Location(
            city=location_text.strip(),
            country=country_enum,
        )

    # First two comma-separated segments map to city/state; extras are ignored.
    return Location(
        city=segments[0],
        state=segments[1],
        country=country_enum,
    )
|
||||
|
||||
|
||||
def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parse a BDJobs date string into a datetime.

    :param date_text: Date text from a job listing (may carry a "Deadline:" prefix)
    :return: Parsed datetime, or None when no known format matches
    """
    from .constant import date_formats

    try:
        # Strip the "Deadline:" label when present; otherwise use the text as-is.
        cleaned = (
            date_text.replace("Deadline:", "").strip()
            if "Deadline:" in date_text
            else date_text
        )

        # Attempt each known BDJobs format until one fits.
        for date_format in date_formats:
            try:
                return datetime.strptime(cleaned, date_format)
            except ValueError:
                continue
        return None
    except Exception:
        # Defensive catch-all: any unexpected failure is treated as "no date".
        return None
|
||||
|
||||
|
||||
def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Locate job-card elements in a search-results page.

    :param soup: Parsed search-results HTML
    :return: List of job card elements (empty when nothing matches)
    """
    from .constant import job_selectors

    # First pass: the known card selectors, in priority order.
    for selector in job_selectors:
        if "." not in selector:
            continue
        tag_name, css_class = selector.split(".", 1)
        matches = soup.find_all(tag_name, class_=css_class)
        if matches:
            return matches

    # Fallback: any anchor pointing at a job-detail page — treat each
    # link's parent element as the job card.
    detail_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if detail_links:
        return [anchor.parent for anchor in detail_links]

    return []
|
||||
|
||||
|
||||
def is_job_remote(title: str, description: str = None, location: Location = None) -> bool:
    """
    Decide whether a job looks remote from its title, description, and location.

    :param title: Job title
    :param description: Job description (optional)
    :param location: Job location (optional)
    :return: True if any remote keyword appears in the combined text
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]

    # Collect every available text field, lower-cased for matching.
    pieces = [title.lower()]
    if description:
        pieces.append(description.lower())
    if location:
        pieces.append(location.display_location().lower())

    haystack = " ".join(pieces)
    return any(keyword in haystack for keyword in remote_keywords)
|
||||
@@ -34,3 +34,12 @@ class GoogleJobsException(Exception):
|
||||
class BaytException(Exception):
    """Raised when the Bayt scraper encounters an error."""

    def __init__(self, message=None):
        # A missing (or empty) message falls back to the generic description.
        fallback = "An error occurred with Bayt"
        super().__init__(message or fallback)
|
||||
|
||||
class NaukriException(Exception):
    """Raised when the Naukri scraper encounters an error."""

    def __init__(self, message=None):
        # Substitute the generic description for a missing/empty message.
        super().__init__(message or "An error occurred with Naukri")
|
||||
|
||||
|
||||
class BDJobsException(Exception):
    """Raised when the BDJobs scraper encounters an error."""

    def __init__(self, message=None):
        default_message = "An error occurred with BDJobs"
        # Empty or None messages are replaced by the default.
        super().__init__(message or default_message)
|
||||
@@ -34,13 +34,13 @@ log = create_logger("Glassdoor")
|
||||
|
||||
class Glassdoor(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes GlassdoorScraper with the Glassdoor job search url
|
||||
"""
|
||||
site = Site(Site.GLASSDOOR)
|
||||
super().__init__(site, proxies=proxies, ca_cert=ca_cert)
|
||||
super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
|
||||
|
||||
self.base_url = None
|
||||
self.country = None
|
||||
@@ -65,6 +65,8 @@ class Glassdoor(Scraper):
|
||||
)
|
||||
token = self._get_csrf_token()
|
||||
headers["gd-csrf-token"] = token if token else fallback_token
|
||||
if self.user_agent:
|
||||
headers["user-agent"] = self.user_agent
|
||||
self.session.headers.update(headers)
|
||||
|
||||
location_id, location_type = self._get_location(
|
||||
|
||||
@@ -13,7 +13,7 @@ headers = {
|
||||
"sec-fetch-dest": "empty",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
|
||||
}
|
||||
query_template = """
|
||||
query JobSearchResultsQuery(
|
||||
|
||||
@@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info
|
||||
|
||||
class Google(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes Google Scraper with the Google jobs search url
|
||||
|
||||
@@ -28,7 +28,7 @@ log = create_logger("Indeed")
|
||||
|
||||
class Indeed(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes IndeedScraper with the Indeed API url
|
||||
|
||||
@@ -20,7 +20,7 @@ def get_job_type(attributes: list) -> list[JobType]:
|
||||
def get_compensation(compensation: dict) -> Compensation | None:
|
||||
"""
|
||||
Parses the job to get compensation
|
||||
:param sssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrompensation:
|
||||
:param compensation:
|
||||
:return: compensation object
|
||||
"""
|
||||
if not compensation["baseSalary"] and not compensation["estimated"]:
|
||||
|
||||
@@ -14,10 +14,11 @@ from bs4.element import Tag
|
||||
from jobspy.exception import LinkedInException
|
||||
from jobspy.linkedin.constant import headers
|
||||
from jobspy.linkedin.util import (
|
||||
is_job_remote,
|
||||
job_type_code,
|
||||
parse_job_type,
|
||||
parse_job_level,
|
||||
parse_company_industry,
|
||||
parse_company_industry
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
@@ -49,7 +50,7 @@ class LinkedIn(Scraper):
|
||||
jobs_per_page = 25
|
||||
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes LinkedInScraper with the LinkedIn job search url
|
||||
@@ -173,7 +174,7 @@ class LinkedIn(Scraper):
|
||||
) -> Optional[JobPost]:
|
||||
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
||||
|
||||
compensation = None
|
||||
compensation = description = None
|
||||
if salary_tag:
|
||||
salary_text = salary_tag.get_text(separator=" ").strip()
|
||||
salary_values = [currency_parser(value) for value in salary_text.split("-")]
|
||||
@@ -217,6 +218,8 @@ class LinkedIn(Scraper):
|
||||
job_details = {}
|
||||
if full_descr:
|
||||
job_details = self._get_job_details(job_id)
|
||||
description = job_details.get("description")
|
||||
is_remote = is_job_remote(title, description, location)
|
||||
|
||||
return JobPost(
|
||||
id=f"li-{job_id}",
|
||||
@@ -224,6 +227,7 @@ class LinkedIn(Scraper):
|
||||
company_name=company,
|
||||
company_url=company_url,
|
||||
location=location,
|
||||
is_remote=is_remote,
|
||||
date_posted=date_posted,
|
||||
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||
compensation=compensation,
|
||||
@@ -232,7 +236,7 @@ class LinkedIn(Scraper):
|
||||
company_industry=job_details.get("company_industry"),
|
||||
description=job_details.get("description"),
|
||||
job_url_direct=job_details.get("job_url_direct"),
|
||||
emails=extract_emails_from_text(job_details.get("description")),
|
||||
emails=extract_emails_from_text(description),
|
||||
company_logo=job_details.get("company_logo"),
|
||||
job_function=job_details.get("job_function"),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from jobspy.model import JobType
|
||||
from jobspy.model import JobType, Location
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
@@ -83,3 +83,14 @@ def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
|
||||
industry = industry_span.get_text(strip=True)
|
||||
|
||||
return industry
|
||||
|
||||
|
||||
def is_job_remote(title: str, description: str, location: Location) -> bool:
    """
    Searches the title, location, and description to check if job is remote

    :param title: Job title text (was mis-annotated as dict; the value is
                  interpolated into an f-string, so it is treated as text)
    :param description: Job description text (may be None; interpolated as
                  the literal "None", which cannot match any keyword)
    :param location: Location whose display string is included in the search
    :return: True if any remote keyword appears in the combined text
    """
    remote_keywords = ["remote", "work from home", "wfh"]
    # Avoid rebinding the `location` parameter — keep the display string separate.
    location_text = location.display_location()
    full_string = f'{title} {description} {location_text}'.lower()
    is_remote = any(keyword in full_string for keyword in remote_keywords)
    return is_remote
|
||||
|
||||
@@ -68,17 +68,22 @@ class Country(Enum):
|
||||
AUSTRALIA = ("australia", "au", "com.au")
|
||||
AUSTRIA = ("austria", "at", "at")
|
||||
BAHRAIN = ("bahrain", "bh")
|
||||
BANGLADESH = ("bangladesh", "bd") # Added Bangladesh
|
||||
BELGIUM = ("belgium", "be", "fr:be")
|
||||
BULGARIA = ("bulgaria", "bg")
|
||||
BRAZIL = ("brazil", "br", "com.br")
|
||||
CANADA = ("canada", "ca", "ca")
|
||||
CHILE = ("chile", "cl")
|
||||
CHINA = ("china", "cn")
|
||||
COLOMBIA = ("colombia", "co")
|
||||
COSTARICA = ("costa rica", "cr")
|
||||
CROATIA = ("croatia", "hr")
|
||||
CYPRUS = ("cyprus", "cy")
|
||||
CZECHREPUBLIC = ("czech republic,czechia", "cz")
|
||||
DENMARK = ("denmark", "dk")
|
||||
ECUADOR = ("ecuador", "ec")
|
||||
EGYPT = ("egypt", "eg")
|
||||
ESTONIA = ("estonia", "ee")
|
||||
FINLAND = ("finland", "fi")
|
||||
FRANCE = ("france", "fr", "fr")
|
||||
GERMANY = ("germany", "de", "de")
|
||||
@@ -92,6 +97,8 @@ class Country(Enum):
|
||||
ITALY = ("italy", "it", "it")
|
||||
JAPAN = ("japan", "jp")
|
||||
KUWAIT = ("kuwait", "kw")
|
||||
LATVIA = ("latvia", "lv")
|
||||
LITHUANIA = ("lithuania", "lt")
|
||||
LUXEMBOURG = ("luxembourg", "lu")
|
||||
MALAYSIA = ("malaysia", "malaysia:my", "com")
|
||||
MALTA = ("malta", "malta:mt", "mt")
|
||||
@@ -112,6 +119,8 @@ class Country(Enum):
|
||||
ROMANIA = ("romania", "ro")
|
||||
SAUDIARABIA = ("saudi arabia", "sa")
|
||||
SINGAPORE = ("singapore", "sg", "sg")
|
||||
SLOVAKIA = ("slovakia", "sk")
|
||||
SLOVENIA = ("slovenia", "sl")
|
||||
SOUTHAFRICA = ("south africa", "za")
|
||||
SOUTHKOREA = ("south korea", "kr")
|
||||
SPAIN = ("spain", "es", "es")
|
||||
@@ -246,13 +255,13 @@ class JobPost(BaseModel):
|
||||
is_remote: bool | None = None
|
||||
listing_type: str | None = None
|
||||
|
||||
# linkedin specific
|
||||
# LinkedIn specific
|
||||
job_level: str | None = None
|
||||
|
||||
# linkedin and indeed specific
|
||||
# LinkedIn and Indeed specific
|
||||
company_industry: str | None = None
|
||||
|
||||
# indeed specific
|
||||
# Indeed specific
|
||||
company_addresses: str | None = None
|
||||
company_num_employees: str | None = None
|
||||
company_revenue: str | None = None
|
||||
@@ -260,9 +269,16 @@ class JobPost(BaseModel):
|
||||
company_logo: str | None = None
|
||||
banner_photo_url: str | None = None
|
||||
|
||||
# linkedin only atm
|
||||
# LinkedIn only atm
|
||||
job_function: str | None = None
|
||||
|
||||
# Naukri specific
|
||||
skills: list[str] | None = None #from tagsAndSkills
|
||||
experience_range: str | None = None #from experienceText
|
||||
company_rating: float | None = None #from ambitionBoxData.AggregateRating
|
||||
company_reviews_count: int | None = None #from ambitionBoxData.ReviewsCount
|
||||
vacancy_count: int | None = None #from vacancy
|
||||
work_from_home_type: str | None = None #from clusters.wfhType (e.g., "Hybrid", "Remote")
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
jobs: list[JobPost] = []
|
||||
@@ -275,6 +291,8 @@ class Site(Enum):
|
||||
GLASSDOOR = "glassdoor"
|
||||
GOOGLE = "google"
|
||||
BAYT = "bayt"
|
||||
NAUKRI = "naukri"
|
||||
BDJOBS = "bdjobs"
|
||||
|
||||
|
||||
class SalarySource(Enum):
|
||||
@@ -298,17 +316,20 @@ class ScraperInput(BaseModel):
|
||||
linkedin_company_ids: list[int] | None = None
|
||||
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
|
||||
|
||||
request_timeout: int = 60
|
||||
|
||||
results_wanted: int = 15
|
||||
hours_old: int | None = None
|
||||
|
||||
|
||||
class Scraper(ABC):
|
||||
def __init__(
|
||||
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
|
||||
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
self.site = site
|
||||
self.proxies = proxies
|
||||
self.ca_cert = ca_cert
|
||||
self.user_agent = user_agent
|
||||
|
||||
@abstractmethod
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
||||
|
||||
301
jobspy/naukri/__init__.py
Normal file
301
jobspy/naukri/__init__.py
Normal file
@@ -0,0 +1,301 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
import regex as re
|
||||
import requests
|
||||
|
||||
from jobspy.exception import NaukriException
|
||||
from jobspy.naukri.constant import headers as naukri_headers
|
||||
from jobspy.naukri.util import (
|
||||
is_job_remote,
|
||||
parse_job_type,
|
||||
parse_company_industry,
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
Country,
|
||||
Compensation,
|
||||
DescriptionFormat,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
currency_parser,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
create_logger,
|
||||
)
|
||||
|
||||
log = create_logger("Naukri")
|
||||
|
||||
class Naukri(Scraper):
    """Scraper for Naukri.com job postings via its public v3 search JSON API."""

    # v3 search endpoint that returns job listings as JSON
    base_url = "https://www.naukri.com/jobapi/v3/search"
    # Base pause (seconds) between page requests; the actual sleep is
    # uniform(delay, delay + band_delay) to look less bot-like
    delay = 3
    band_delay = 4
    # Page size requested via the noOfResults query parameter
    jobs_per_page = 20
|
||||
|
||||
def __init__(
    self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
    """
    Initializes NaukriScraper with the Naukri API URL.

    :param proxies: proxy URL or list of proxy URLs to rotate through
    :param ca_cert: path to a CA certificate bundle for TLS verification
    :param user_agent: optional User-Agent string overriding the default header
    """
    # Bug fix: user_agent was accepted but silently dropped; forward it to the
    # base Scraper (which stores it as self.user_agent) and apply it to the
    # session headers so it actually takes effect on requests.
    super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
    self.session = create_session(
        proxies=self.proxies,
        ca_cert=ca_cert,
        is_tls=False,
        has_retry=True,
        delay=5,
        clear_cookies=True,
    )
    self.session.headers.update(naukri_headers)
    if user_agent:
        self.session.headers["user-agent"] = user_agent
    self.scraper_input = None
    self.country = "India"  # Naukri is India-focused by default
    log.info("Naukri scraper initialized")
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
    """
    Scrapes the Naukri search API for jobs matching scraper_input criteria.

    Paginates through results (jobs_per_page per page, capped at 50 pages),
    de-duplicates by job id, and sleeps a randomized delay between pages.

    :param scraper_input: search criteria (term, location, remote, hours_old, ...)
    :return: JobResponse containing up to results_wanted JobPost objects
    """
    self.scraper_input = scraper_input
    job_list: list[JobPost] = []
    seen_ids = set()  # guards against duplicate job ids across pages
    start = scraper_input.offset or 0
    page = (start // self.jobs_per_page) + 1  # API pages are 1-based
    request_count = 0
    seconds_old = (
        scraper_input.hours_old * 3600 if scraper_input.hours_old else None
    )
    continue_search = (
        lambda: len(job_list) < scraper_input.results_wanted and page <= 50  # Arbitrary limit
    )

    while continue_search():
        request_count += 1
        log.info(
            f"Scraping page {request_count} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)} "
            f"for search term: {scraper_input.search_term}"
        )
        # Query params for the v3 search API; None-valued entries are stripped below.
        params = {
            "noOfResults": self.jobs_per_page,
            "urlType": "search_by_keyword",
            "searchType": "adv",
            "keyword": scraper_input.search_term,
            "pageNo": page,
            "k": scraper_input.search_term,
            "seoKey": f"{scraper_input.search_term.lower().replace(' ', '-')}-jobs",
            "src": "jobsearchDesk",
            "latLong": "",
            "location": scraper_input.location,
            "remote": "true" if scraper_input.is_remote else None,
        }
        if seconds_old:
            params["days"] = seconds_old // 86400  # Convert to days
        params = {k: v for k, v in params.items() if v is not None}
        try:
            log.debug(f"Sending request to {self.base_url} with params: {params}")
            response = self.session.get(self.base_url, params=params, timeout=10)
            # Treat anything outside 2xx/3xx as fatal for this search; return
            # whatever was collected so far instead of raising.
            if response.status_code not in range(200, 400):
                err = f"Naukri API response status code {response.status_code} - {response.text}"
                log.error(err)
                return JobResponse(jobs=job_list)
            data = response.json()
            job_details = data.get("jobDetails", [])
            log.info(f"Received {len(job_details)} job entries from API")
            if not job_details:
                log.warning("No job details found in API response")
                break
        except Exception as e:
            # Network/JSON failures end the search but keep partial results.
            log.error(f"Naukri API request failed: {str(e)}")
            return JobResponse(jobs=job_list)

        for job in job_details:
            job_id = job.get("jobId")
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            log.debug(f"Processing job ID: {job_id}")

            try:
                # NOTE(review): reuses the LinkedIn-specific flag to decide whether
                # to include full descriptions -- confirm this is intentional.
                fetch_desc = scraper_input.linkedin_fetch_description
                job_post = self._process_job(job, job_id, fetch_desc)
                if job_post:
                    job_list.append(job_post)
                    log.info(f"Added job: {job_post.title} (ID: {job_id})")
                if not continue_search():
                    break
            except Exception as e:
                # Re-raise as the package-specific exception so callers can
                # distinguish Naukri processing failures.
                log.error(f"Error processing job ID {job_id}: {str(e)}")
                raise NaukriException(str(e))

        if continue_search():
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
            page += 1

    job_list = job_list[:scraper_input.results_wanted]
    log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
    return JobResponse(jobs=job_list)
|
||||
|
||||
def _process_job(
    self, job: dict, job_id: str, full_descr: bool
) -> Optional[JobPost]:
    """
    Converts a single raw job dict from the Naukri API into a JobPost.

    :param job: one entry from the API's "jobDetails" list
    :param job_id: unique Naukri job id (used in the JobPost id and URL fallback)
    :param full_descr: when True, include the (optionally markdown-converted) description
    :return: populated JobPost
    """
    title = job.get("title", "N/A")
    company = job.get("companyName", "N/A")
    company_url = f"https://www.naukri.com/{job.get('staticUrl', '')}" if job.get("staticUrl") else None

    location = self._get_location(job.get("placeholders", []))
    compensation = self._get_compensation(job.get("placeholders", []))
    date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))

    # jdURL is the canonical listing path; fall back to /job/<id> if absent
    job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
    description = job.get("jobDescription") if full_descr else None
    if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
        description = markdown_converter(description)

    # NOTE(review): parse_job_type / parse_company_industry (jobspy.naukri.util)
    # are typed against BeautifulSoup but receive a plain str here; str.find()
    # does not accept the class_ keyword -- confirm these call sites work.
    job_type = parse_job_type(description) if description else None
    company_industry = parse_company_industry(description) if description else None
    is_remote = is_job_remote(title, description or "", location)
    company_logo = job.get("logoPathV3") or job.get("logoPath")

    # Naukri-specific fields
    skills = job.get("tagsAndSkills", "").split(",") if job.get("tagsAndSkills") else None
    experience_range = job.get("experienceText")
    ambition_box = job.get("ambitionBoxData", {})
    company_rating = float(ambition_box.get("AggregateRating")) if ambition_box.get("AggregateRating") else None
    company_reviews_count = ambition_box.get("ReviewsCount")
    vacancy_count = job.get("vacancy")
    work_from_home_type = self._infer_work_from_home_type(job.get("placeholders", []), title, description or "")

    job_post = JobPost(
        id=f"nk-{job_id}",
        title=title,
        company_name=company,
        company_url=company_url,
        location=location,
        is_remote=is_remote,
        date_posted=date_posted,
        job_url=job_url,
        compensation=compensation,
        job_type=job_type,
        company_industry=company_industry,
        description=description,
        emails=extract_emails_from_text(description or ""),
        company_logo=company_logo,
        skills=skills,
        experience_range=experience_range,
        company_rating=company_rating,
        company_reviews_count=company_reviews_count,
        vacancy_count=vacancy_count,
        work_from_home_type=work_from_home_type,
    )
    log.debug(f"Processed job: {title} at {company}")
    return job_post
|
||||
|
||||
def _get_location(self, placeholders: list[dict]) -> Location:
    """
    Builds a Location from the first 'location'-typed placeholder.

    The label is expected as "City, State"; when no location placeholder
    exists, returns an India-only Location.
    """
    entry = next((p for p in placeholders if p.get("type") == "location"), None)
    if entry is None:
        return Location(country=Country.INDIA)
    pieces = entry.get("label", "").split(", ")
    city = pieces[0] if pieces else None
    state = pieces[1] if len(pieces) > 1 else None
    location = Location(city=city, state=state, country=Country.INDIA)
    log.debug(f"Parsed location: {location.display_location()}")
    return location
|
||||
|
||||
def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
    """
    Parses salary data from placeholders into an INR Compensation.

    Understands Indian range formats such as "12-16 Lacs P.A." and "1-5 Cr".
    Returns None when salary is absent, undisclosed, or unparseable.
    """
    for entry in placeholders:
        if entry.get("type") != "salary":
            continue
        salary_text = entry.get("label", "").strip()
        if salary_text == "Not disclosed":
            log.debug("Salary not disclosed")
            return None

        # Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
        salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
        if not salary_match:
            log.debug(f"Could not parse salary: {salary_text}")
            return None

        min_salary, max_salary, unit = salary_match.groups()[:3]
        min_salary, max_salary = float(min_salary), float(max_salary)
        currency = "INR"

        # Scale to absolute INR: 1 Lakh = 1e5, 1 Crore = 1e7
        scale = 100000 if unit.lower() in ("lacs", "lakh") else 10000000
        min_salary *= scale
        max_salary *= scale

        log.debug(f"Parsed salary: {min_salary} - {max_salary} INR")
        return Compensation(
            min_amount=int(min_salary),
            max_amount=int(max_salary),
            currency=currency,
        )
    return None
|
||||
|
||||
def _parse_date(self, label: str, created_date: int) -> Optional[date]:
|
||||
"""
|
||||
Parses date from footerPlaceholderLabel or createdDate, returning a date object
|
||||
"""
|
||||
today = datetime.now()
|
||||
if not label:
|
||||
if created_date:
|
||||
return datetime.fromtimestamp(created_date / 1000).date() # Convert to date
|
||||
return None
|
||||
label = label.lower()
|
||||
if "today" in label or "just now" in label or "few hours" in label:
|
||||
log.debug("Date parsed as today")
|
||||
return today.date()
|
||||
elif "ago" in label:
|
||||
match = re.search(r"(\d+)\s*day", label)
|
||||
if match:
|
||||
days = int(match.group(1))
|
||||
parsed_date = (today - timedelta(days = days)).date()
|
||||
log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
|
||||
return parsed_date
|
||||
elif created_date:
|
||||
parsed_date = datetime.fromtimestamp(created_date / 1000).date()
|
||||
log.debug(f"Date parsed from timestamp: {parsed_date}")
|
||||
return parsed_date
|
||||
log.debug("No date parsed")
|
||||
return None
|
||||
|
||||
def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
|
||||
"""
|
||||
Infers work-from-home type from job data (e.g., 'Hybrid', 'Remote', 'Work from office')
|
||||
"""
|
||||
location_str = next((p["label"] for p in placeholders if p["type"] == "location"), "").lower()
|
||||
if "hybrid" in location_str or "hybrid" in title.lower() or "hybrid" in description.lower():
|
||||
return "Hybrid"
|
||||
elif "remote" in location_str or "remote" in title.lower() or "remote" in description.lower():
|
||||
return "Remote"
|
||||
elif "work from office" in description.lower() or not ("remote" in description.lower() or "hybrid" in description.lower()):
|
||||
return "Work from office"
|
||||
return None
|
||||
11
jobspy/naukri/constant.py
Normal file
11
jobspy/naukri/constant.py
Normal file
@@ -0,0 +1,11 @@
|
||||
# Default request headers for the Naukri job-search API.
# "appid"/"systemid" identify the client to Naukri's backend; "Nkparam" is an
# opaque token captured from browser traffic -- presumably required for API
# access; TODO confirm it does not expire or rotate.
headers = {
    "authority": "www.naukri.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "appid": "109",
    "systemid": "Naukri",
    "Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
|
||||
34
jobspy/naukri/util.py
Normal file
34
jobspy/naukri/util.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from jobspy.model import JobType, Location
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
    """
    Gets the job type from the job page.

    Looks for a <span class="job-type"> tag and maps its normalized text
    (lowercased, hyphens removed) to a JobType enum.

    NOTE(review): the caller in jobspy/naukri/__init__.py passes the job
    description as a plain str, not a BeautifulSoup object; str.find() does
    not accept the class_ keyword, so that call would raise TypeError --
    confirm the call sites.
    """
    job_type_tag = soup.find("span", class_="job-type")
    if job_type_tag:
        job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
        return [get_enum_from_job_type(job_type_str)] if job_type_str else None
    return None
|
||||
|
||||
|
||||
def parse_company_industry(soup: BeautifulSoup) -> str | None:
    """
    Gets the company industry from the job page.

    Reads the text of a <span class="industry"> tag, if present.

    NOTE(review): the caller in jobspy/naukri/__init__.py passes a plain str
    instead of a BeautifulSoup object -- confirm the call sites.
    """
    industry_tag = soup.find("span", class_="industry")
    return industry_tag.get_text(strip=True) if industry_tag else None
|
||||
|
||||
|
||||
def is_job_remote(title: str, description: str, location: Location) -> bool:
|
||||
"""
|
||||
Searches the title, description, and location to check if the job is remote
|
||||
"""
|
||||
remote_keywords = ["remote", "work from home", "wfh"]
|
||||
location_str = location.display_location()
|
||||
full_string = f"{title} {description} {location_str}".lower()
|
||||
return any(keyword in full_string for keyword in remote_keywords)
|
||||
@@ -47,11 +47,12 @@ class RotatingProxySession:
|
||||
"""Utility method to format a proxy string into a dictionary."""
|
||||
if proxy.startswith("http://") or proxy.startswith("https://"):
|
||||
return {"http": proxy, "https": proxy}
|
||||
if proxy.startswith("socks5://"):
|
||||
return {"http": proxy, "https": proxy}
|
||||
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
|
||||
|
||||
|
||||
class RequestsRotating(RotatingProxySession, requests.Session):
|
||||
|
||||
def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
|
||||
RotatingProxySession.__init__(self, proxies=proxies)
|
||||
requests.Session.__init__(self)
|
||||
@@ -86,7 +87,6 @@ class RequestsRotating(RotatingProxySession, requests.Session):
|
||||
|
||||
|
||||
class TLSRotating(RotatingProxySession, tls_client.Session):
|
||||
|
||||
def __init__(self, proxies=None):
|
||||
RotatingProxySession.__init__(self, proxies=proxies)
|
||||
tls_client.Session.__init__(self, random_tls_extension_order=True)
|
||||
@@ -344,4 +344,11 @@ desired_order = [
|
||||
"company_num_employees",
|
||||
"company_revenue",
|
||||
"company_description",
|
||||
# naukri-specific fields
|
||||
"skills",
|
||||
"experience_range",
|
||||
"company_rating",
|
||||
"company_reviews_count",
|
||||
"vacancy_count",
|
||||
"work_from_home_type",
|
||||
]
|
||||
|
||||
@@ -38,7 +38,7 @@ class ZipRecruiter(Scraper):
|
||||
api_url = "https://api.ziprecruiter.com"
|
||||
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
|
||||
|
||||
@@ -4,12 +4,12 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.77"
|
||||
version = "1.1.82"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter, Bayt & Naukri"
|
||||
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||
readme = "README.md"
|
||||
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
|
||||
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt", "naukri"]
|
||||
[[tool.poetry.packages]]
|
||||
include = "jobspy"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user