Compare commits

...

17 Commits

SHA1        Author            Message                                                                  Date
abd5878238  Cullen Watson     chore:readme                                                             2025-07-28 17:15:09 +02:00
ae2b1ea42c  itsShrizon        Bdjobs Fixed (#280)                                                      2025-07-28 10:05:10 -05:00
53b3b41385  Cullen Watson     fix: glassdoor ua                                                        2025-07-28 16:55:51 +02:00
9aae02453d  Lê Trọng Tài      issue#270: glassdoor 403 response by rotating user-agent and updating headers (#274)  2025-07-28 09:55:05 -05:00
94d413bad1  Piotr Geca        support for socks5 proxies (#266)                                        2025-04-10 15:53:28 -05:00
                              Co-authored-by: Piotr Geca <piotr.geca@npl.co.uk>
61205bcc77  Cullen Watson     chore: version                                                           2025-03-27 21:59:47 -05:00
f1602eca70  Nikhil Sasi       Fix date parsing error: prevent negative days by using timedelta (#264)  2025-03-27 21:58:42 -05:00
                              subtracting extracted "days" from label with current day causes negative days
                              datetime class rejects negative day association
                              Use timedelta for proper date limitation
                              Co-authored-by: NIKHIL S <nikhil_s@nikhilMac.local>
d4d52d05f5  Cullen Watson     chore:version                                                            2025-03-21 17:35:23 -05:00
0946cb3373  Liju Thomas       feat: add naukri.com support (#259)                                      2025-03-21 17:23:07 -05:00
051981689f  prudvisorra-aifa  Update util.py (#256)                                                    2025-03-17 11:51:19 -05:00
903b7e6f1b  Cullen Watson     fix(linkedin):is remote                                                  2025-03-06 13:38:28 -06:00
6782b9884e  Cullen Watson     fix:workflow                                                             2025-03-01 14:49:31 -06:00
94c74d60f2  Cullen Watson     enh:workflow manual run                                                  2025-03-01 14:47:24 -06:00
5463e5a664  Cullen Watson     chore:version                                                            2025-03-01 14:38:25 -06:00
ed139e7e6b  arkhy             added missing EU countries and languages (#250)                          2025-03-01 14:30:08 -06:00
                              Co-authored-by: Kate Arkhangelskaya <ekar559e@tu-dresden.de>
5bd199d0a5  Cullen Watson     Merge branch 'main' of https://github.com/Bunsly/JobSpy                  2025-02-21 14:15:06 -06:00
4ec308a302  Cullen Watson     refactor:organize code                                                   2025-02-21 14:14:55 -06:00
32 changed files with 1518 additions and 647 deletions


@@ -1,50 +1,37 @@
-name: Publish Python 🐍 distributions 📦 to PyPI
+name: Publish JobSpy to PyPi
 on:
-  pull_request:
-    types:
-      - closed
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+permissions:
+  contents: write

 jobs:
   build-n-publish:
-    name: Build and publish Python 🐍 distributions 📦 to PyPI
+    name: Build and publish JobSpy to PyPi
     runs-on: ubuntu-latest
-    if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Install dependencies
-        run: pip install toml
-      - name: Increment version
-        run: python increment_version.py
-      - name: Commit version increment
-        run: |
-          git config --global user.name 'github-actions'
-          git config --global user.email 'github-actions@github.com'
-          git add pyproject.toml
-          git commit -m 'Increment version'
-      - name: Push changes
-        run: git push
       - name: Install poetry
-        run: pip install poetry --user
+        run: >-
+          python3 -m
+          pip install
+          poetry
+          --user
       - name: Build distribution 📦
-        run: poetry build
+        run: >-
+          python3 -m
+          poetry
+          build
       - name: Publish distribution 📦 to PyPI
+        if: startsWith(github.ref, 'refs/tags') || github.event_name == 'workflow_dispatch'
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           password: ${{ secrets.PYPI_API_TOKEN }}


@@ -4,7 +4,7 @@
 ## Features
-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & other job boards concurrently
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking
@@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs

 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "google"],  # "glassdoor", "bayt", "naukri", "bdjobs"
     search_term="software engineer",
     google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
@@ -51,6 +51,7 @@ linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale
 linkedin        Full-Stack Software Engineer   Rain          New York      NY  fulltime  yearly  None    None    https://www.linkedin.com/jobs/view/3696158877      Rains mission is to create the fastest and ea...
 zip_recruiter   Software Engineer - New Grad   ZipRecruiter  Santa Monica  CA  fulltime  yearly  130000  150000  https://www.ziprecruiter.com/jobs/ziprecruiter...  We offer a hybrid work environment. Most US-ba...
 zip_recruiter   Software Developer             TEKsystems    Phoenix       AZ  fulltime  hourly  65      75      https://www.ziprecruiter.com/jobs/teksystems-0...  Top Skills' Details• 6 years of Java developme...
 ```
 ### Parameters for `scrape_jobs()`
@@ -58,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt
+|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt, bdjobs
|    (default is all)
 ├── search_term (str)
@@ -85,6 +86,10 @@ Optional
 ├── easy_apply (bool):
 |    filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
+|
+├── user_agent (str):
+|    override the default user agent which may be outdated
+|
 ├── description_format (str):
 |    markdown, html (Format type of the job descriptions. Default is markdown.)
@@ -220,6 +225,7 @@ JobPost
 │ ├── country
 │ ├── city
 │ ├── state
+├── is_remote
 ├── description
 ├── job_type: fulltime, parttime, internship, contract
 ├── job_function
@@ -229,8 +235,7 @@ JobPost
 │ ├── currency
 │ └── salary_source: direct_data, description (parsed from posting)
 ├── date_posted
-├── emails
-└── is_remote
+└── emails

 Linkedin specific
 └── job_level
@@ -245,4 +250,12 @@ Indeed specific
 ├── company_revenue_label
 ├── company_description
 └── company_logo
+
+Naukri specific
+├── skills
+├── experience_range
+├── company_rating
+├── company_reviews_count
+├── vacancy_count
+└── work_from_home_type
 ```
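
Taken together, the README changes expose a `user_agent` knob and two new boards (`naukri`, `bdjobs`). A minimal usage sketch of the new surface, assuming the package is installed per the README quick-start; the user-agent string here is only a placeholder:

```python
from jobspy import scrape_jobs

# Sketch: exercises the new bdjobs board and the new user_agent override.
jobs = scrape_jobs(
    site_name=["indeed", "bdjobs"],
    search_term="software engineer",
    results_wanted=10,
    user_agent="Mozilla/5.0 (placeholder UA string)",  # new parameter in this changeset
)
print(jobs[["site", "title", "company", "is_remote"]].head())
```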


@@ -1,21 +0,0 @@
-import toml
-
-def increment_version(version):
-    major, minor, patch = map(int, version.split('.'))
-    patch += 1
-    return f"{major}.{minor}.{patch}"
-
-# Load pyproject.toml
-with open('pyproject.toml', 'r') as file:
-    pyproject = toml.load(file)
-
-# Increment the version
-current_version = pyproject['tool']['poetry']['version']
-new_version = increment_version(current_version)
-pyproject['tool']['poetry']['version'] = new_version
-
-# Save the updated pyproject.toml
-with open('pyproject.toml', 'w') as file:
-    toml.dump(pyproject, file)
-
-print(f"Version updated from {current_version} to {new_version}")


@@ -1,27 +1,33 @@
 from __future__ import annotations

-import pandas as pd
-from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Tuple

-from .jobs import JobType, Location
-from .scrapers.utils import set_logger_level, extract_salary, create_logger
-from .scrapers.indeed import IndeedScraper
-from .scrapers.ziprecruiter import ZipRecruiterScraper
-from .scrapers.glassdoor import GlassdoorScraper
-from .scrapers.google import GoogleJobsScraper
-from .scrapers.linkedin import LinkedInScraper
-from .scrapers.bayt import BaytScraper
-from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
-from .scrapers.exceptions import (
-    LinkedInException,
-    IndeedException,
-    ZipRecruiterException,
-    GlassdoorException,
-    GoogleJobsException,
-)
+import pandas as pd
+
+from jobspy.bayt import BaytScraper
+from jobspy.bdjobs import BDJobs
+from jobspy.glassdoor import Glassdoor
+from jobspy.google import Google
+from jobspy.indeed import Indeed
+from jobspy.linkedin import LinkedIn
+from jobspy.naukri import Naukri
+from jobspy.model import JobType, Location, JobResponse, Country
+from jobspy.model import SalarySource, ScraperInput, Site
+from jobspy.util import (
+    set_logger_level,
+    extract_salary,
+    create_logger,
+    get_enum_from_value,
+    map_str_to_site,
+    convert_to_annual,
+    desired_order,
+)
+from jobspy.ziprecruiter import ZipRecruiter
+
+# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function

 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
@@ -33,7 +39,6 @@ def scrape_jobs(
     easy_apply: bool | None = None,
     results_wanted: int = 15,
     country_indeed: str = "usa",
-    hyperlinks: bool = False,
     proxies: list[str] | str | None = None,
     ca_cert: str | None = None,
     description_format: str = "markdown",
@@ -43,31 +48,24 @@ def scrape_jobs(
     hours_old: int = None,
     enforce_annual_salary: bool = False,
     verbose: int = 0,
+    user_agent: str = None,
     **kwargs,
 ) -> pd.DataFrame:
     """
-    Simultaneously scrapes job data from multiple job sites.
-    :return: pandas dataframe containing job data
+    Scrapes job data from job boards concurrently
+    :return: Pandas DataFrame containing job data
     """
     SCRAPER_MAPPING = {
-        Site.LINKEDIN: LinkedInScraper,
-        Site.INDEED: IndeedScraper,
-        Site.ZIP_RECRUITER: ZipRecruiterScraper,
-        Site.GLASSDOOR: GlassdoorScraper,
-        Site.GOOGLE: GoogleJobsScraper,
+        Site.LINKEDIN: LinkedIn,
+        Site.INDEED: Indeed,
+        Site.ZIP_RECRUITER: ZipRecruiter,
+        Site.GLASSDOOR: Glassdoor,
+        Site.GOOGLE: Google,
         Site.BAYT: BaytScraper,
+        Site.NAUKRI: Naukri,
+        Site.BDJOBS: BDJobs,  # Add BDJobs to the scraper mapping
     }
     set_logger_level(verbose)

-    def map_str_to_site(site_name: str) -> Site:
-        return Site[site_name.upper()]
-
-    def get_enum_from_value(value_str):
-        for job_type in JobType:
-            if value_str in job_type.value:
-                return job_type
-        raise Exception(f"Invalid job type: {value_str}")
-
     job_type = get_enum_from_value(job_type) if job_type else None

     def get_site_type():
@@ -105,7 +103,7 @@ def scrape_jobs(
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
+        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
@@ -127,28 +125,12 @@ def scrape_jobs(
             site_value, scraped_data = future.result()
             site_to_jobs_dict[site_value] = scraped_data

-    def convert_to_annual(job_data: dict):
-        if job_data["interval"] == "hourly":
-            job_data["min_amount"] *= 2080
-            job_data["max_amount"] *= 2080
-        if job_data["interval"] == "monthly":
-            job_data["min_amount"] *= 12
-            job_data["max_amount"] *= 12
-        if job_data["interval"] == "weekly":
-            job_data["min_amount"] *= 52
-            job_data["max_amount"] *= 52
-        if job_data["interval"] == "daily":
-            job_data["min_amount"] *= 260
-            job_data["max_amount"] *= 260
-        job_data["interval"] = "yearly"
-
     jobs_dfs: list[pd.DataFrame] = []

     for site, job_response in site_to_jobs_dict.items():
         for job in job_response.jobs:
             job_data = job.dict()
             job_url = job_data["job_url"]
-            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
@@ -164,6 +146,7 @@ def scrape_jobs(
                 **job_data["location"]
             ).display_location()

+            # Handle compensation
             compensation_obj = job_data.get("compensation")
             if compensation_obj and isinstance(compensation_obj, dict):
                 job_data["interval"] = (
@@ -182,7 +165,6 @@ def scrape_jobs(
                     and job_data["max_amount"]
                 ):
                     convert_to_annual(job_data)
-
             else:
                 if country_enum == Country.USA:
                     (
@@ -201,6 +183,17 @@ def scrape_jobs(
                 if "min_amount" in job_data and job_data["min_amount"]
                 else None
             )
+
+            # naukri-specific fields
+            job_data["skills"] = (
+                ", ".join(job_data["skills"]) if job_data["skills"] else None
+            )
+            job_data["experience_range"] = job_data.get("experience_range")
+            job_data["company_rating"] = job_data.get("company_rating")
+            job_data["company_reviews_count"] = job_data.get("company_reviews_count")
+            job_data["vacancy_count"] = job_data.get("vacancy_count")
+            job_data["work_from_home_type"] = job_data.get("work_from_home_type")
+
             job_df = pd.DataFrame([job_data])
             jobs_dfs.append(job_df)
@@ -211,38 +204,6 @@ def scrape_jobs(
     # Step 2: Concatenate the filtered DataFrames
     jobs_df = pd.concat(filtered_dfs, ignore_index=True)

-    # Desired column order
-    desired_order = [
-        "id",
-        "site",
-        "job_url_hyper" if hyperlinks else "job_url",
-        "job_url_direct",
-        "title",
-        "company",
-        "location",
-        "date_posted",
-        "job_type",
-        "salary_source",
-        "interval",
-        "min_amount",
-        "max_amount",
-        "currency",
-        "is_remote",
-        "job_level",
-        "job_function",
-        "listing_type",
-        "emails",
-        "description",
-        "company_industry",
-        "company_url",
-        "company_logo",
-        "company_url_direct",
-        "company_addresses",
-        "company_num_employees",
-        "company_revenue",
-        "company_description",
-    ]
     # Step 3: Ensure all desired columns are present, adding missing ones as empty
     for column in desired_order:
         if column not in jobs_df.columns:
@@ -257,3 +218,9 @@ def scrape_jobs(
         ).reset_index(drop=True)
     else:
         return pd.DataFrame()
+
+
+# Add BDJobs to __all__
+__all__ = [
+    "BDJobs",
+]
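
The removed inner helper `convert_to_annual` now lives in `jobspy.util` (it is imported above). A behavior-equivalent sketch of its arithmetic, using the same year factors (2080 work hours, 52 weeks, 12 months, 260 weekdays), written as a table-driven rewrite rather than the repo's verbatim code:

```python
# Equivalent rewrite of convert_to_annual (not the repo's verbatim code):
# annualize a salary range in place using standard per-year factors.
def convert_to_annual(job_data: dict) -> None:
    factors = {"hourly": 2080, "weekly": 52, "monthly": 12, "daily": 260}
    factor = factors.get(job_data["interval"])
    if factor:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"

row = {"interval": "hourly", "min_amount": 65, "max_amount": 75}
convert_to_annual(row)
assert (row["min_amount"], row["max_amount"]) == (135200, 156000)
```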


@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.bayt
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Bayt.
-"""
-
 from __future__ import annotations

 import random
@@ -12,9 +5,16 @@ import time
 from bs4 import BeautifulSoup

-from .. import Scraper, ScraperInput, Site
-from ..utils import create_logger, create_session
-from ...jobs import JobPost, JobResponse, Location, Country
+from jobspy.model import (
+    Scraper,
+    ScraperInput,
+    Site,
+    JobPost,
+    JobResponse,
+    Location,
+    Country,
+)
+from jobspy.util import create_logger, create_session

 log = create_logger("Bayt")
@@ -25,7 +25,7 @@ class BaytScraper(Scraper):
     band_delay = 3

     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None

jobspy/bdjobs/__init__.py (new file, 353 lines)

@@ -0,0 +1,353 @@
# __init__.py
from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import (
    parse_location,
    parse_date,
    find_job_listings,
    is_job_remote,
)
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")


class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted
        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")
            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page
                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )
                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break
                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)
                if not job_cards or len(job_cards) == 0:
                    log.info("No more job listings found")
                    break
                log.info(f"Found {len(job_cards)} job cards on page {page}")
                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)
                        if not continue_search():
                            break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")
                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None
            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            job_id = (
                job_url.split("jobid=")[-1].split("&")[0]
                if "jobid=" in job_url
                else f"bdjobs-{hash(job_url)}"
            )

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(
                    ["h2", "h3", "h4", "strong", "div"],
                    class_=lambda c: c and "job-title-text" in c,
                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name - IMPROVED
            company_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "comp-name-text" in (c or "").lower(),
            )
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["company", "org", "comp-name"]
                    ),
                )
                company_name = (
                    company_elem.get_text(strip=True) if company_elem else "N/A"
                )

            # Extract location
            location_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "locon-text-d" in (c or "").lower(),
            )
            if not location_elem:
                location_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["location", "area", "locon"]
                    ),
                )
            location_text = (
                location_elem.get_text(strip=True)
                if location_elem
                else "Dhaka, Bangladesh"
            )
            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            date_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c
                and any(
                    term in (c or "").lower()
                    for term in ["date", "deadline", "published"]
                ),
            )
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type", "")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, "html.parser")

            # Find job description - IMPROVED based on correct.py
            description = ""
            # Try to find the job content div first (as in correct.py)
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find(
                    "h4", id="job_resp"
                ) or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True)
                                for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(
                                sibling.get_text(separator=" ", strip=True)
                            )
                    description = (
                        "\n".join(responsibilities_elements)
                        if responsibilities_elements
                        else ""
                    )

            # If no description found yet, try the original approach
            if not description:
                description_elem = soup.find(
                    ["div", "section"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["job-description", "details", "requirements"]
                    ),
                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if (
                        hasattr(self.scraper_input, "description_format")
                        and self.scraper_input.description_format
                        == DescriptionFormat.MARKDOWN
                    ):
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(
                ["span", "div"],
                string=lambda s: s
                and any(
                    term in (s or "").lower()
                    for term in ["job type", "employment type"]
                ),
            )
            job_type = None
            if job_type_elem:
                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                job_type = job_type_text if job_type_text else None

            # Extract company industry
            industry_elem = soup.find(
                ["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
            )
            company_industry = None
            if industry_elem:
                industry_text = industry_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }
        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
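
A hedged usage sketch for the new scraper. `search_term` and `results_wanted` are the fields `scrape` reads above; `ScraperInput` may require other fields in the real model, so treat this as a minimal illustration rather than a complete call:

```python
from jobspy.bdjobs import BDJobs
from jobspy.model import ScraperInput

# Minimal illustration; ScraperInput may require additional fields.
scraper = BDJobs()
response = scraper.scrape(ScraperInput(search_term="python developer", results_wanted=5))
for job in response.jobs:
    print(job.title, "|", job.company_name, "|", job.job_url)
```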

jobspy/bdjobs/constant.py (new file, 32 lines)

@@ -0,0 +1,32 @@
# constant.py

# Headers for BDJobs requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings
job_selectors = [
    "div.job-item",           # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",      # Catches featured job listings
]

# Date formats used by BDJobs
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]

jobspy/bdjobs/util.py (new file, 100 lines)

@@ -0,0 +1,100 @@
# util.py
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional, List, Dict, Any

from jobspy.model import Location, Country


def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Parses location text into a Location object
    :param location_text: Location text from job listing
    :param country: Default country
    :return: Location object
    """
    parts = location_text.split(",")
    if len(parts) >= 2:
        city = parts[0].strip()
        state = parts[1].strip()
        return Location(
            city=city,
            state=state,
            country=Country.from_string(country)
        )
    else:
        return Location(
            city=location_text.strip(),
            country=Country.from_string(country)
        )


def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object
    :param date_text: Date text from job listing
    :return: datetime object or None if parsing fails
    """
    from .constant import date_formats

    try:
        # Clean up date text
        if "Deadline:" in date_text:
            date_text = date_text.replace("Deadline:", "").strip()
        # Try different date formats
        for fmt in date_formats:
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue
        return None
    except Exception:
        return None


def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Finds job listing elements in the HTML
    :param soup: BeautifulSoup object
    :return: List of job card elements
    """
    from .constant import job_selectors

    # Try different selectors
    for selector in job_selectors:
        if "." in selector:
            tag_name, class_name = selector.split(".", 1)
            elements = soup.find_all(tag_name, class_=class_name)
            if elements and len(elements) > 0:
                return elements
    # If no selectors match, look for job detail links
    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if job_links:
        # Return parent elements of job links
        return [link.parent for link in job_links]
    return []


def is_job_remote(title: str, description: str = None, location: Location = None) -> bool:
    """
    Determines if a job is remote based on title, description, and location
    :param title: Job title
    :param description: Job description
    :param location: Job location
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]
    # Combine all text fields
    full_text = title.lower()
    if description:
        full_text += " " + description.lower()
    if location:
        full_text += " " + location.display_location().lower()
    # Check for remote keywords
    return any(keyword in full_text for keyword in remote_keywords)
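
These helpers are pure functions, so they can be exercised directly; the example inputs below follow the `Deadline:` prefix and date formats defined in `constant.py`:

```python
from jobspy.bdjobs.util import parse_date, parse_location, is_job_remote

print(parse_date("Deadline: 15 Mar 2025"))      # datetime(2025, 3, 15, 0, 0)
loc = parse_location("Dhaka, Bangladesh")       # city="Dhaka", state="Bangladesh"
print(loc.city, loc.state)
print(is_job_remote("Remote Python Engineer"))  # True ("remote" appears in the title)
```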


@@ -1,5 +1,5 @@
 """
-jobspy.scrapers.exceptions
+jobspy.jobboard.exceptions
 ~~~~~~~~~~~~~~~~~~~

 This module contains the set of Scrapers' exceptions.
@@ -34,3 +34,12 @@ class GoogleJobsException(Exception):
 class BaytException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Bayt")
+
+
+class NaukriException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Naukri")
+
+
+class BDJobsException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with BDJobs")
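
A small sketch of how a caller might catch the new board-specific failures; both classes carry a default message, as defined above (the import path follows the `jobspy.exception` module used by the bdjobs scraper):

```python
from jobspy.exception import BDJobsException, NaukriException

try:
    raise BDJobsException()  # simulated failure for illustration
except (BDJobsException, NaukriException) as e:
    print(e)  # "An error occurred with BDJobs"
```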


@@ -1,49 +1,46 @@
-"""
-jobspy.scrapers.glassdoor
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Glassdoor.
-"""
-
 from __future__ import annotations

 import re
 import json
 import requests
-from typing import Optional, Tuple
+from typing import Tuple
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor, as_completed

-from .constants import fallback_token, query_template, headers
-from .. import Scraper, ScraperInput, Site
-from ..utils import extract_emails_from_text, create_logger
-from ..exceptions import GlassdoorException
-from ..utils import (
-    create_session,
-    markdown_converter,
-)
-from ...jobs import (
-    JobPost,
-    Compensation,
-    CompensationInterval,
-    Location,
-    JobResponse,
-    JobType,
-    DescriptionFormat,
-)
+from jobspy.glassdoor.constant import fallback_token, query_template, headers
+from jobspy.glassdoor.util import (
+    get_cursor_for_page,
+    parse_compensation,
+    parse_location,
+)
+from jobspy.util import (
+    extract_emails_from_text,
+    create_logger,
+    create_session,
+    markdown_converter,
+)
+from jobspy.exception import GlassdoorException
+from jobspy.model import (
+    JobPost,
+    JobResponse,
+    DescriptionFormat,
+    Scraper,
+    ScraperInput,
+    Site,
+)

 log = create_logger("Glassdoor")


-class GlassdoorScraper(Scraper):
+class Glassdoor(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxies=proxies, ca_cert=ca_cert)
+        super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
         self.base_url = None
         self.country = None
@@ -68,6 +65,8 @@ class GlassdoorScraper(Scraper):
         )
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
+        if self.user_agent:
+            headers["user-agent"] = self.user_agent
         self.session.headers.update(headers)

         location_id, location_type = self._get_location(
@@ -146,7 +145,7 @@ class GlassdoorScraper(Scraper):
         except Exception as exc:
             raise GlassdoorException(f"Glassdoor generated an exception: {exc}")

-        return jobs, self.get_cursor_for_page(
+        return jobs, get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )
@@ -185,9 +184,9 @@ class GlassdoorScraper(Scraper):
         if location_type == "S":
             is_remote = True
         else:
-            location = self.parse_location(location_name)
+            location = parse_location(location_name)

-        compensation = self.parse_compensation(job["header"])
+        compensation = parse_compensation(job["header"])
         try:
             description = self._fetch_job_description(job_id)
         except:
@@ -321,44 +320,3 @@ class GlassdoorScraper(Scraper):
             {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
         )
         return json.dumps([payload])
-
-    @staticmethod
-    def parse_compensation(data: dict) -> Optional[Compensation]:
-        pay_period = data.get("payPeriod")
-        adjusted_pay = data.get("payPeriodAdjustedPay")
-        currency = data.get("payCurrency", "USD")
-        if not pay_period or not adjusted_pay:
-            return None
-
-        interval = None
-        if pay_period == "ANNUAL":
-            interval = CompensationInterval.YEARLY
-        elif pay_period:
-            interval = CompensationInterval.get_interval(pay_period)
-        min_amount = int(adjusted_pay.get("p10") // 1)
-        max_amount = int(adjusted_pay.get("p90") // 1)
-        return Compensation(
-            interval=interval,
-            min_amount=min_amount,
-            max_amount=max_amount,
-            currency=currency,
-        )
-
-    @staticmethod
-    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return [job_type]
-
-    @staticmethod
-    def parse_location(location_name: str) -> Location | None:
-        if not location_name or location_name == "Remote":
-            return
-        city, _, state = location_name.partition(", ")
-        return Location(city=city, state=state)
-
-    @staticmethod
-    def get_cursor_for_page(pagination_cursors, page_num):
-        for cursor_data in pagination_cursors:
-            if cursor_data["pageNumber"] == page_num:
-                return cursor_data["cursor"]


@@ -13,7 +13,7 @@ headers = {
     "sec-fetch-dest": "empty",
     "sec-fetch-mode": "cors",
     "sec-fetch-site": "same-origin",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
 }

 query_template = """
 query JobSearchResultsQuery(

jobspy/glassdoor/util.py (new file, 42 lines)

@@ -0,0 +1,42 @@
from jobspy.model import Compensation, CompensationInterval, Location, JobType


def parse_compensation(data: dict) -> Compensation | None:
    pay_period = data.get("payPeriod")
    adjusted_pay = data.get("payPeriodAdjustedPay")
    currency = data.get("payCurrency", "USD")
    if not pay_period or not adjusted_pay:
        return None

    interval = None
    if pay_period == "ANNUAL":
        interval = CompensationInterval.YEARLY
    elif pay_period:
        interval = CompensationInterval.get_interval(pay_period)
    min_amount = int(adjusted_pay.get("p10") // 1)
    max_amount = int(adjusted_pay.get("p90") // 1)
    return Compensation(
        interval=interval,
        min_amount=min_amount,
        max_amount=max_amount,
        currency=currency,
    )


def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]


def parse_location(location_name: str) -> Location | None:
    if not location_name or location_name == "Remote":
        return
    city, _, state = location_name.partition(", ")
    return Location(city=city, state=state)


def get_cursor_for_page(pagination_cursors, page_num):
    for cursor_data in pagination_cursors:
        if cursor_data["pageNumber"] == page_num:
            return cursor_data["cursor"]
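
A worked example of `parse_compensation`; the key names match what the function reads, while the values are invented for illustration:

```python
from jobspy.glassdoor.util import parse_compensation

header = {
    "payPeriod": "ANNUAL",
    "payCurrency": "USD",
    "payPeriodAdjustedPay": {"p10": 130000.0, "p90": 150000.0},  # invented values
}
comp = parse_compensation(header)
print(comp.interval, comp.min_amount, comp.max_amount, comp.currency)
# CompensationInterval.YEARLY 130000 150000 USD
```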


@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.google
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Google.
-"""
-
 from __future__ import annotations

 import math
@@ -13,25 +6,23 @@ import json
 from typing import Tuple
 from datetime import datetime, timedelta

-from .constants import headers_jobs, headers_initial, async_param
-from .. import Scraper, ScraperInput, Site
-from ..utils import extract_emails_from_text, create_logger, extract_job_type
-from ..utils import (
-    create_session,
-)
-from ...jobs import (
+from jobspy.google.constant import headers_jobs, headers_initial, async_param
+from jobspy.model import (
+    Scraper,
+    ScraperInput,
+    Site,
     JobPost,
     JobResponse,
     Location,
     JobType,
 )
-
-log = create_logger("Google")
+from jobspy.util import extract_emails_from_text, extract_job_type, create_session
+from jobspy.google.util import log, find_job_info_initial_page, find_job_info


-class GoogleJobsScraper(Scraper):
+class Google(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes Google Scraper with the Goodle jobs search url
@@ -135,7 +126,7 @@ class GoogleJobsScraper(Scraper):
         pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        jobs_raw = self._find_job_info_initial_page(response.text)
+        jobs_raw = find_job_info_initial_page(response.text)
         jobs = []
         for job_raw in jobs_raw:
             job_post = self._parse_job(job_raw)
@@ -167,7 +158,7 @@ class GoogleJobsScraper(Scraper):
                 continue
             job_d = json.loads(job_data)

-            job_info = self._find_job_info(job_d)
+            job_info = find_job_info(job_d)
             job_post = self._parse_job(job_info)
             if job_post:
                 jobs_on_page.append(job_post)
@@ -209,39 +200,3 @@ class GoogleJobsScraper(Scraper):
             job_type=extract_job_type(description),
         )
         return job_post
-
-    @staticmethod
-    def _find_job_info(jobs_data: list | dict) -> list | None:
-        """Iterates through the JSON data to find the job listings"""
-        if isinstance(jobs_data, dict):
-            for key, value in jobs_data.items():
-                if key == "520084652" and isinstance(value, list):
-                    return value
-                else:
-                    result = GoogleJobsScraper._find_job_info(value)
-                    if result:
-                        return result
-        elif isinstance(jobs_data, list):
-            for item in jobs_data:
-                result = GoogleJobsScraper._find_job_info(item)
-                if result:
-                    return result
-        return None
-
-    @staticmethod
-    def _find_job_info_initial_page(html_text: str):
-        pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
-        results = []
-        matches = re.finditer(pattern, html_text)
-        import json
-
-        for match in matches:
-            try:
-                parsed_data = json.loads(match.group(1))
-                results.append(parsed_data)
-            except json.JSONDecodeError as e:
-                log.error(f"Failed to parse match: {str(e)}")
-                results.append({"raw_match": match.group(0), "error": str(e)})
-        return results

jobspy/google/util.py (new file, 41 lines)

@@ -0,0 +1,41 @@
import re

from jobspy.util import create_logger

log = create_logger("Google")


def find_job_info(jobs_data: list | dict) -> list | None:
    """Iterates through the JSON data to find the job listings"""
    if isinstance(jobs_data, dict):
        for key, value in jobs_data.items():
            if key == "520084652" and isinstance(value, list):
                return value
            else:
                result = find_job_info(value)
                if result:
                    return result
    elif isinstance(jobs_data, list):
        for item in jobs_data:
            result = find_job_info(item)
            if result:
                return result
    return None


def find_job_info_initial_page(html_text: str):
    pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
    results = []
    matches = re.finditer(pattern, html_text)
    import json

    for match in matches:
        try:
            parsed_data = json.loads(match.group(1))
            results.append(parsed_data)
        except json.JSONDecodeError as e:
            log.error(f"Failed to parse match: {str(e)}")
            results.append({"raw_match": match.group(0), "error": str(e)})
    return results
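
`find_job_info` recurses through arbitrarily nested JSON until it reaches the `"520084652"` key Google uses for the job payload; a made-up structure shows the traversal:

```python
from jobspy.google.util import find_job_info

# Made-up nesting; only the magic key matters to the traversal.
payload = {"data": [{"noise": 1}, {"520084652": [["job", "blob", "here"]]}]}
print(find_job_info(payload))  # [['job', 'blob', 'here']]
```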


@@ -1,41 +1,34 @@
-"""
-jobspy.scrapers.indeed
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Indeed.
-"""
-
 from __future__ import annotations

 import math
-from typing import Tuple
 from datetime import datetime
+from typing import Tuple

-from .constants import job_search_query, api_headers
-from .. import Scraper, ScraperInput, Site
-from ..utils import (
-    extract_emails_from_text,
-    get_enum_from_job_type,
-    markdown_converter,
-    create_session,
-    create_logger,
-)
-from ...jobs import (
+from jobspy.indeed.constant import job_search_query, api_headers
+from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
+from jobspy.model import (
+    Scraper,
+    ScraperInput,
+    Site,
     JobPost,
-    Compensation,
-    CompensationInterval,
     Location,
     JobResponse,
     JobType,
     DescriptionFormat,
 )
+from jobspy.util import (
+    extract_emails_from_text,
+    markdown_converter,
+    create_session,
+    create_logger,
+)

 log = create_logger("Indeed")


-class IndeedScraper(Scraper):
+class Indeed(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes IndeedScraper with the Indeed API url
@@ -213,7 +206,7 @@ class IndeedScraper(Scraper):
         if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
             description = markdown_converter(description)

-        job_type = self._get_job_type(job["attributes"])
+        job_type = get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
         date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
         employer = job["employer"].get("dossier") if job["employer"] else None
@@ -234,14 +227,14 @@ class IndeedScraper(Scraper):
                 country=job.get("location", {}).get("countryCode"),
             ),
             job_type=job_type,
-            compensation=self._get_compensation(job["compensation"]),
+            compensation=get_compensation(job["compensation"]),
             date_posted=date_posted,
             job_url=job_url,
             job_url_direct=(
                 job["recruit"].get("viewJobUrl") if job.get("recruit") else None
             ),
             emails=extract_emails_from_text(description) if description else None,
-            is_remote=self._is_job_remote(job, description),
+            is_remote=is_job_remote(job, description),
             company_addresses=(
                 employer_details["addresses"][0]
                 if employer_details.get("addresses")
@@ -265,86 +258,3 @@ class IndeedScraper(Scraper):
                 else None
             ),
         )
-
-    @staticmethod
-    def _get_job_type(attributes: list) -> list[JobType]:
-        """
-        Parses the attributes to get list of job types
-        :param attributes:
-        :return: list of JobType
-        """
-        job_types: list[JobType] = []
-        for attribute in attributes:
-            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
-            job_type = get_enum_from_job_type(job_type_str)
-            if job_type:
-                job_types.append(job_type)
-        return job_types
-
-    @staticmethod
-    def _get_compensation(compensation: dict) -> Compensation | None:
-        """
-        Parses the job to get compensation
-        :param job:
-        :return: compensation object
-        """
-        if not compensation["baseSalary"] and not compensation["estimated"]:
-            return None
-        comp = (
-            compensation["baseSalary"]
-            if compensation["baseSalary"]
-            else compensation["estimated"]["baseSalary"]
-        )
-        if not comp:
-            return None
-        interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
-        if not interval:
-            return None
-        min_range = comp["range"].get("min")
-        max_range = comp["range"].get("max")
-        return Compensation(
-            interval=interval,
-            min_amount=int(min_range) if min_range is not None else None,
-            max_amount=int(max_range) if max_range is not None else None,
-            currency=(
-                compensation["estimated"]["currencyCode"]
-                if compensation["estimated"]
-                else compensation["currencyCode"]
-            ),
-        )
-
-    @staticmethod
-    def _is_job_remote(job: dict, description: str) -> bool:
-        """
-        Searches the description, location, and attributes to check if job is remote
-        """
-        remote_keywords = ["remote", "work from home", "wfh"]
-        is_remote_in_attributes = any(
-            any(keyword in attr["label"].lower() for keyword in remote_keywords)
-            for attr in job["attributes"]
-        )
-        is_remote_in_description = any(
-            keyword in description.lower() for keyword in remote_keywords
-        )
-        is_remote_in_location = any(
-            keyword in job["location"]["formatted"]["long"].lower()
-            for keyword in remote_keywords
-        )
-        return (
-            is_remote_in_attributes or is_remote_in_description or is_remote_in_location
-        )
-
-    @staticmethod
-    def _get_compensation_interval(interval: str) -> CompensationInterval:
-        interval_mapping = {
-            "DAY": "DAILY",
-            "YEAR": "YEARLY",
-            "HOUR": "HOURLY",
-            "WEEK": "WEEKLY",
-            "MONTH": "MONTHLY",
-        }
-        mapped_interval = interval_mapping.get(interval.upper(), None)
-        if mapped_interval and mapped_interval in CompensationInterval.__members__:
-            return CompensationInterval[mapped_interval]
-        else:
-            raise ValueError(f"Unsupported interval: {interval}")

jobspy/indeed/util.py (new file, 83 lines)

@@ -0,0 +1,83 @@
from jobspy.model import CompensationInterval, JobType, Compensation
from jobspy.util import get_enum_from_job_type


def get_job_type(attributes: list) -> list[JobType]:
    """
    Parses the attributes to get list of job types
    :param attributes:
    :return: list of JobType
    """
    job_types: list[JobType] = []
    for attribute in attributes:
        job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
        job_type = get_enum_from_job_type(job_type_str)
        if job_type:
            job_types.append(job_type)
    return job_types


def get_compensation(compensation: dict) -> Compensation | None:
    """
    Parses the job to get compensation
    :param compensation:
    :return: compensation object
    """
    if not compensation["baseSalary"] and not compensation["estimated"]:
        return None
    comp = (
        compensation["baseSalary"]
        if compensation["baseSalary"]
        else compensation["estimated"]["baseSalary"]
    )
    if not comp:
        return None
    interval = get_compensation_interval(comp["unitOfWork"])
    if not interval:
        return None
    min_range = comp["range"].get("min")
    max_range = comp["range"].get("max")
    return Compensation(
        interval=interval,
        min_amount=int(min_range) if min_range is not None else None,
        max_amount=int(max_range) if max_range is not None else None,
        currency=(
            compensation["estimated"]["currencyCode"]
            if compensation["estimated"]
            else compensation["currencyCode"]
        ),
    )


def is_job_remote(job: dict, description: str) -> bool:
    """
    Searches the description, location, and attributes to check if job is remote
    """
    remote_keywords = ["remote", "work from home", "wfh"]
    is_remote_in_attributes = any(
        any(keyword in attr["label"].lower() for keyword in remote_keywords)
        for attr in job["attributes"]
    )
    is_remote_in_description = any(
        keyword in description.lower() for keyword in remote_keywords
    )
    is_remote_in_location = any(
        keyword in job["location"]["formatted"]["long"].lower()
        for keyword in remote_keywords
    )
    return is_remote_in_attributes or is_remote_in_description or is_remote_in_location


def get_compensation_interval(interval: str) -> CompensationInterval:
    interval_mapping = {
        "DAY": "DAILY",
        "YEAR": "YEARLY",
        "HOUR": "HOURLY",
        "WEEK": "WEEKLY",
        "MONTH": "MONTHLY",
    }
    mapped_interval = interval_mapping.get(interval.upper(), None)
    if mapped_interval and mapped_interval in CompensationInterval.__members__:
        return CompensationInterval[mapped_interval]
    else:
        raise ValueError(f"Unsupported interval: {interval}")
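
A worked example of `get_compensation` with the payload shape the function reads (values invented); `unitOfWork` is mapped through `get_compensation_interval`:

```python
from jobspy.indeed.util import get_compensation

compensation = {
    "baseSalary": {"unitOfWork": "YEAR", "range": {"min": 120000, "max": 140000}},
    "estimated": None,      # falsy, so currency falls back to top-level currencyCode
    "currencyCode": "USD",
}
comp = get_compensation(compensation)
print(comp.interval, comp.min_amount, comp.max_amount, comp.currency)
# CompensationInterval.YEARLY 120000 140000 USD
```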


@@ -1,54 +1,56 @@
"""
jobspy.scrapers.linkedin
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape LinkedIn.
"""
from __future__ import annotations from __future__ import annotations
import math import math
import time
import random import random
import regex as re import time
from typing import Optional
from datetime import datetime from datetime import datetime
from typing import Optional
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote from urllib.parse import urlparse, urlunparse, unquote
from .constants import headers import regex as re
from .. import Scraper, ScraperInput, Site from bs4 import BeautifulSoup
from ..exceptions import LinkedInException from bs4.element import Tag
from ..utils import create_session, remove_attributes, create_logger
from ...jobs import ( from jobspy.exception import LinkedInException
from jobspy.linkedin.constant import headers
from jobspy.linkedin.util import (
is_job_remote,
job_type_code,
parse_job_type,
parse_job_level,
parse_company_industry
)
from jobspy.model import (
JobPost, JobPost,
Location, Location,
JobResponse, JobResponse,
JobType,
Country, Country,
Compensation, Compensation,
DescriptionFormat, DescriptionFormat,
Scraper,
ScraperInput,
Site,
) )
from ..utils import ( from jobspy.util import (
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type,
currency_parser, currency_parser,
markdown_converter, markdown_converter,
create_session,
remove_attributes,
create_logger,
) )
log = create_logger("LinkedIn") log = create_logger("LinkedIn")
class LinkedInScraper(Scraper): class LinkedIn(Scraper):
base_url = "https://www.linkedin.com" base_url = "https://www.linkedin.com"
delay = 3 delay = 3
band_delay = 4 band_delay = 4
jobs_per_page = 25 jobs_per_page = 25
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
@@ -95,7 +97,7 @@ class LinkedInScraper(Scraper):
"distance": scraper_input.distance, "distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None, "f_WT": 2 if scraper_input.is_remote else None,
"f_JT": ( "f_JT": (
self.job_type_code(scraper_input.job_type) job_type_code(scraper_input.job_type)
if scraper_input.job_type if scraper_input.job_type
else None else None
), ),
@@ -172,7 +174,7 @@ class LinkedInScraper(Scraper):
) -> Optional[JobPost]: ) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info") salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None compensation = description = None
if salary_tag: if salary_tag:
salary_text = salary_tag.get_text(separator=" ").strip() salary_text = salary_tag.get_text(separator=" ").strip()
salary_values = [currency_parser(value) for value in salary_text.split("-")] salary_values = [currency_parser(value) for value in salary_text.split("-")]
@@ -216,6 +218,8 @@ class LinkedInScraper(Scraper):
job_details = {} job_details = {}
if full_descr: if full_descr:
job_details = self._get_job_details(job_id) job_details = self._get_job_details(job_id)
description = job_details.get("description")
is_remote = is_job_remote(title, description, location)
return JobPost( return JobPost(
id=f"li-{job_id}", id=f"li-{job_id}",
@@ -223,6 +227,7 @@ class LinkedInScraper(Scraper):
company_name=company, company_name=company,
company_url=company_url, company_url=company_url,
location=location, location=location,
is_remote=is_remote,
date_posted=date_posted, date_posted=date_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}", job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation, compensation=compensation,
@@ -231,7 +236,7 @@ class LinkedInScraper(Scraper):
company_industry=job_details.get("company_industry"), company_industry=job_details.get("company_industry"),
description=job_details.get("description"), description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"), job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")), emails=extract_emails_from_text(description),
company_logo=job_details.get("company_logo"), company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"), job_function=job_details.get("job_function"),
) )
@@ -282,9 +287,9 @@ class LinkedInScraper(Scraper):
) )
return { return {
"description": description, "description": description,
"job_level": self._parse_job_level(soup), "job_level": parse_job_level(soup),
"company_industry": self._parse_company_industry(soup), "company_industry": parse_company_industry(soup),
"job_type": self._parse_job_type(soup), "job_type": parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup), "job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo, "company_logo": company_logo,
"job_function": job_function, "job_function": job_function,
@@ -316,77 +321,6 @@ class LinkedInScraper(Scraper):
location = Location(city=city, state=state, country=country) location = Location(city=city, state=state, country=country)
return location return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
@@ -403,13 +337,3 @@ class LinkedInScraper(Scraper):
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
jobspy/linkedin/util.py (new file, +96)
@@ -0,0 +1,96 @@
from bs4 import BeautifulSoup
from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
def parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def is_job_remote(title: dict, description: str, location: Location) -> bool:
"""
Searches the title, location, and description to check if job is remote
"""
remote_keywords = ["remote", "work from home", "wfh"]
location = location.display_location()
full_string = f'{title} {description} {location}'.lower()
is_remote = any(keyword in full_string for keyword in remote_keywords)
return is_remote
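A quick sanity check of these parsers against a hand-written fragment shaped like LinkedIn's criteria markup (the HTML below is an illustrative sample, not captured from the site):

from bs4 import BeautifulSoup
from jobspy.linkedin.util import parse_job_level, parse_job_type

sample = (
    '<h3 class="description__job-criteria-subheader">Seniority level</h3>'
    '<span class="description__job-criteria-text description__job-criteria-text--criteria">Mid-Senior level</span>'
    '<h3 class="description__job-criteria-subheader">Employment type</h3>'
    '<span class="description__job-criteria-text description__job-criteria-text--criteria">Full-time</span>'
)
soup = BeautifulSoup(sample, "html.parser")
print(parse_job_level(soup))  # Mid-Senior level
print(parse_job_type(soup))   # [JobType.FULL_TIME] ("Full-time" -> "fulltime")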
jobspy/model.py (renamed from src/jobspy/jobs.py)
@@ -1,5 +1,6 @@
from __future__ import annotations
+ from abc import ABC, abstractmethod
from typing import Optional
from datetime import date
from enum import Enum
@@ -67,17 +68,22 @@ class Country(Enum):
AUSTRALIA = ("australia", "au", "com.au")
AUSTRIA = ("austria", "at", "at")
BAHRAIN = ("bahrain", "bh")
+ BANGLADESH = ("bangladesh", "bd")
BELGIUM = ("belgium", "be", "fr:be")
+ BULGARIA = ("bulgaria", "bg")
BRAZIL = ("brazil", "br", "com.br")
CANADA = ("canada", "ca", "ca")
CHILE = ("chile", "cl")
CHINA = ("china", "cn")
COLOMBIA = ("colombia", "co")
COSTARICA = ("costa rica", "cr")
+ CROATIA = ("croatia", "hr")
+ CYPRUS = ("cyprus", "cy")
CZECHREPUBLIC = ("czech republic,czechia", "cz")
DENMARK = ("denmark", "dk")
ECUADOR = ("ecuador", "ec")
EGYPT = ("egypt", "eg")
+ ESTONIA = ("estonia", "ee")
FINLAND = ("finland", "fi")
FRANCE = ("france", "fr", "fr")
GERMANY = ("germany", "de", "de")
@@ -91,6 +97,8 @@ class Country(Enum):
ITALY = ("italy", "it", "it")
JAPAN = ("japan", "jp")
KUWAIT = ("kuwait", "kw")
+ LATVIA = ("latvia", "lv")
+ LITHUANIA = ("lithuania", "lt")
LUXEMBOURG = ("luxembourg", "lu")
MALAYSIA = ("malaysia", "malaysia:my", "com")
MALTA = ("malta", "malta:mt", "mt")
@@ -111,6 +119,8 @@ class Country(Enum):
ROMANIA = ("romania", "ro")
SAUDIARABIA = ("saudi arabia", "sa")
SINGAPORE = ("singapore", "sg", "sg")
+ SLOVAKIA = ("slovakia", "sk")
+ SLOVENIA = ("slovenia", "sl")
SOUTHAFRICA = ("south africa", "za")
SOUTHKOREA = ("south korea", "kr")
SPAIN = ("spain", "es", "es")
@@ -245,13 +255,13 @@ class JobPost(BaseModel):
is_remote: bool | None = None
listing_type: str | None = None
- # linkedin specific
+ # LinkedIn specific
job_level: str | None = None
- # linkedin and indeed specific
+ # LinkedIn and Indeed specific
company_industry: str | None = None
- # indeed specific
+ # Indeed specific
company_addresses: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
@@ -259,9 +269,67 @@ class JobPost(BaseModel):
company_logo: str | None = None
banner_photo_url: str | None = None
- # linkedin only atm
+ # LinkedIn only atm
job_function: str | None = None
+ # Naukri specific
+ skills: list[str] | None = None  # from tagsAndSkills
+ experience_range: str | None = None  # from experienceText
+ company_rating: float | None = None  # from ambitionBoxData.AggregateRating
+ company_reviews_count: int | None = None  # from ambitionBoxData.ReviewsCount
+ vacancy_count: int | None = None  # from vacancy
+ work_from_home_type: str | None = None  # from clusters.wfhType (e.g., "Hybrid", "Remote")
class JobResponse(BaseModel):
jobs: list[JobPost] = []
class Site(Enum):
LINKEDIN = "linkedin"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
BAYT = "bayt"
NAUKRI = "naukri"
BDJOBS = "bdjobs" # Add this line
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None
location: str | None = None
country: Country | None = Country.USA
distance: int | None = None
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
request_timeout: int = 60
results_wanted: int = 15
hours_old: int | None = None
class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
self.user_agent = user_agent
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
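With Site, ScraperInput, and the Scraper ABC now living in jobspy.model, a new board is a subclass plus a Site member. A minimal sketch of the contract (DummyBoard is hypothetical, it reuses an existing Site member for brevity, and the JobPost fields shown are illustrative since the model's required fields are not part of this hunk):

from jobspy.model import JobPost, JobResponse, Scraper, ScraperInput, Site

class DummyBoard(Scraper):
    def __init__(self, proxies=None, ca_cert=None, user_agent=None):
        super().__init__(Site.GOOGLE, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        # A real scraper would page through the board's API here.
        job = JobPost(
            id="dummy-1",
            title=scraper_input.search_term or "any",
            company_name="Example Co",
            job_url="https://example.com/jobs/1",
        )
        return JobResponse(jobs=[job])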
jobspy/naukri/__init__.py (new file, +301)
@@ -0,0 +1,301 @@
from __future__ import annotations
import math
import random
import time
from datetime import datetime, date, timedelta
from typing import Optional
import regex as re
import requests
from jobspy.exception import NaukriException
from jobspy.naukri.constant import headers as naukri_headers
from jobspy.naukri.util import (
is_job_remote,
parse_job_type,
parse_company_industry,
)
from jobspy.model import (
JobPost,
Location,
JobResponse,
Country,
Compensation,
DescriptionFormat,
Scraper,
ScraperInput,
Site,
)
from jobspy.util import (
extract_emails_from_text,
currency_parser,
markdown_converter,
create_session,
create_logger,
)
log = create_logger("Naukri")
class Naukri(Scraper):
base_url = "https://www.naukri.com/jobapi/v3/search"
delay = 3
band_delay = 4
jobs_per_page = 20
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
"""
Initializes NaukriScraper with the Naukri API URL
"""
super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert)
self.session = create_session(
proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
clear_cookies=True,
)
self.session.headers.update(naukri_headers)
self.scraper_input = None
self.country = "India"  # Naukri is India-focused by default
log.info("Naukri scraper initialized")
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Naukri API for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
start = scraper_input.offset or 0
page = (start // self.jobs_per_page) + 1
request_count = 0
seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
continue_search = (
lambda: len(job_list) < scraper_input.results_wanted and page <= 50 # Arbitrary limit
)
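# Example: offset=45 gives start=45 and page=(45 // 20) + 1 = 3, so the scraper
# resumes from the third API page; the page <= 50 cap bounds the crawl even
# when results_wanted is large.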
while continue_search():
request_count += 1
log.info(
f"Scraping page {request_count} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)} "
f"for search term: {scraper_input.search_term}"
)
params = {
"noOfResults": self.jobs_per_page,
"urlType": "search_by_keyword",
"searchType": "adv",
"keyword": scraper_input.search_term,
"pageNo": page,
"k": scraper_input.search_term,
"seoKey": f"{scraper_input.search_term.lower().replace(' ', '-')}-jobs",
"src": "jobsearchDesk",
"latLong": "",
"location": scraper_input.location,
"remote": "true" if scraper_input.is_remote else None,
}
if seconds_old:
params["days"] = seconds_old // 86400 # Convert to days
params = {k: v for k, v in params.items() if v is not None}
try:
log.debug(f"Sending request to {self.base_url} with params: {params}")
response = self.session.get(self.base_url, params=params, timeout=10)
if response.status_code not in range(200, 400):
err = f"Naukri API response status code {response.status_code} - {response.text}"
log.error(err)
return JobResponse(jobs=job_list)
data = response.json()
job_details = data.get("jobDetails", [])
log.info(f"Received {len(job_details)} job entries from API")
if not job_details:
log.warning("No job details found in API response")
break
except Exception as e:
log.error(f"Naukri API request failed: {str(e)}")
return JobResponse(jobs=job_list)
for job in job_details:
job_id = job.get("jobId")
if not job_id or job_id in seen_ids:
continue
seen_ids.add(job_id)
log.debug(f"Processing job ID: {job_id}")
try:
fetch_desc = scraper_input.linkedin_fetch_description
job_post = self._process_job(job, job_id, fetch_desc)
if job_post:
job_list.append(job_post)
log.info(f"Added job: {job_post.title} (ID: {job_id})")
if not continue_search():
break
except Exception as e:
log.error(f"Error processing job ID {job_id}: {str(e)}")
raise NaukriException(str(e))
if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += 1
job_list = job_list[:scraper_input.results_wanted]
log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
return JobResponse(jobs=job_list)
def _process_job(
self, job: dict, job_id: str, full_descr: bool
) -> Optional[JobPost]:
"""
Processes a single job from API response into a JobPost object
"""
title = job.get("title", "N/A")
company = job.get("companyName", "N/A")
company_url = f"https://www.naukri.com/{job.get('staticUrl', '')}" if job.get("staticUrl") else None
location = self._get_location(job.get("placeholders", []))
compensation = self._get_compensation(job.get("placeholders", []))
date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
description = job.get("jobDescription") if full_descr else None
if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
job_type = parse_job_type(description) if description else None
company_industry = parse_company_industry(description) if description else None
is_remote = is_job_remote(title, description or "", location)
company_logo = job.get("logoPathV3") or job.get("logoPath")
# Naukri-specific fields
skills = job.get("tagsAndSkills", "").split(",") if job.get("tagsAndSkills") else None
experience_range = job.get("experienceText")
ambition_box = job.get("ambitionBoxData", {})
company_rating = float(ambition_box.get("AggregateRating")) if ambition_box.get("AggregateRating") else None
company_reviews_count = ambition_box.get("ReviewsCount")
vacancy_count = job.get("vacancy")
work_from_home_type = self._infer_work_from_home_type(job.get("placeholders", []), title, description or "")
job_post = JobPost(
id=f"nk-{job_id}",
title=title,
company_name=company,
company_url=company_url,
location=location,
is_remote=is_remote,
date_posted=date_posted,
job_url=job_url,
compensation=compensation,
job_type=job_type,
company_industry=company_industry,
description=description,
emails=extract_emails_from_text(description or ""),
company_logo=company_logo,
skills=skills,
experience_range=experience_range,
company_rating=company_rating,
company_reviews_count=company_reviews_count,
vacancy_count=vacancy_count,
work_from_home_type=work_from_home_type,
)
log.debug(f"Processed job: {title} at {company}")
return job_post
def _get_location(self, placeholders: list[dict]) -> Location:
"""
Extracts location data from placeholders
"""
location = Location(country=Country.INDIA)
for placeholder in placeholders:
if placeholder.get("type") == "location":
location_str = placeholder.get("label", "")
parts = location_str.split(", ")
city = parts[0] if parts else None
state = parts[1] if len(parts) > 1 else None
location = Location(city=city, state=state, country=Country.INDIA)
log.debug(f"Parsed location: {location.display_location()}")
break
return location
def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
"""
Extracts compensation data from placeholders, handling Indian salary formats (Lakhs, Crores)
"""
for placeholder in placeholders:
if placeholder.get("type") == "salary":
salary_text = placeholder.get("label", "").strip()
if salary_text == "Not disclosed":
log.debug("Salary not disclosed")
return None
# Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
if salary_match:
min_salary, max_salary, unit = salary_match.groups()[:3]
min_salary, max_salary = float(min_salary), float(max_salary)
currency = "INR"
# Convert to base units (INR)
if unit.lower() in ("lacs", "lakh"):
min_salary *= 100000 # 1 Lakh = 100,000 INR
max_salary *= 100000
elif unit.lower() == "cr":
min_salary *= 10000000 # 1 Crore = 10,000,000 INR
max_salary *= 10000000
log.debug(f"Parsed salary: {min_salary} - {max_salary} INR")
return Compensation(
min_amount=int(min_salary),
max_amount=int(max_salary),
currency=currency,
)
else:
log.debug(f"Could not parse salary: {salary_text}")
return None
return None
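# Example of the regex above on a typical label (sample string, not a live
# API response): "12-16 Lacs P.A." matches with groups ("12", "16", "Lacs"),
# which becomes Compensation(min_amount=1200000, max_amount=1600000, currency="INR").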
def _parse_date(self, label: str, created_date: int) -> Optional[date]:
"""
Parses date from footerPlaceholderLabel or createdDate, returning a date object
"""
today = datetime.now()
if not label:
if created_date:
return datetime.fromtimestamp(created_date / 1000).date() # Convert to date
return None
label = label.lower()
if "today" in label or "just now" in label or "few hours" in label:
log.debug("Date parsed as today")
return today.date()
elif "ago" in label:
match = re.search(r"(\d+)\s*day", label)
if match:
days = int(match.group(1))
parsed_date = (today - timedelta(days=days)).date()
log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
return parsed_date
elif created_date:
parsed_date = datetime.fromtimestamp(created_date / 1000).date()
log.debug(f"Date parsed from timestamp: {parsed_date}")
return parsed_date
log.debug("No date parsed")
return None
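# Example: label "3 days ago" with today = 2025-03-01 parses to date(2025, 2, 26);
# timedelta handles the month rollover that a naive day-field subtraction
# would reject as an invalid negative day.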
def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
"""
Infers work-from-home type from job data (e.g., 'Hybrid', 'Remote', 'Work from office')
"""
location_str = next((p["label"] for p in placeholders if p["type"] == "location"), "").lower()
if "hybrid" in location_str or "hybrid" in title.lower() or "hybrid" in description.lower():
return "Hybrid"
elif "remote" in location_str or "remote" in title.lower() or "remote" in description.lower():
return "Remote"
elif "work from office" in description.lower() or not ("remote" in description.lower() or "hybrid" in description.lower()):
return "Work from office"
return None
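End to end, the new scraper plugs into the package's public helper. A usage sketch, assuming the top-level scrape_jobs() accepts "naukri" once this module is wired in (the wiring itself is not among the files shown here); jobs is a pandas DataFrame whose columns follow the desired_order list added in jobspy/util.py below:

from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["naukri"],
    search_term="data engineer",
    location="Bengaluru",
    results_wanted=10,
    hours_old=72,  # becomes params["days"] = 3 via seconds_old // 86400
)
print(jobs[["title", "company", "skills", "experience_range"]].head())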
jobspy/naukri/constant.py (new file, +11)
@@ -0,0 +1,11 @@
headers = {
"authority": "www.naukri.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"appid": "109",
"systemid": "Naukri",
"Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
jobspy/naukri/util.py (new file, +34)
@@ -0,0 +1,34 @@
from __future__ import annotations
from bs4 import BeautifulSoup
from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type
def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from the job page
"""
job_type_tag = soup.find("span", class_="job-type")
if job_type_tag:
job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
return [get_enum_from_job_type(job_type_str)] if job_type_str else None
return None
def parse_company_industry(soup: BeautifulSoup) -> str | None:
"""
Gets the company industry from the job page
"""
industry_tag = soup.find("span", class_="industry")
return industry_tag.get_text(strip=True) if industry_tag else None
def is_job_remote(title: str, description: str, location: Location) -> bool:
"""
Searches the title, description, and location to check if the job is remote
"""
remote_keywords = ["remote", "work from home", "wfh"]
location_str = location.display_location()
full_string = f"{title} {description} {location_str}".lower()
return any(keyword in full_string for keyword in remote_keywords)
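The check is purely lexical, so a keyword anywhere in the combined string flips it. For instance (Location fields chosen arbitrarily):

from jobspy.model import Country, Location
from jobspy.naukri.util import is_job_remote

loc = Location(city="Pune", country=Country.INDIA)
is_job_remote("Python Developer (Work From Home)", "", loc)      # True
is_job_remote("Python Developer", "Hybrid, 3 days onsite", loc)  # False: "hybrid" is not a keyword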
jobspy/util.py (moved from src/jobspy/scrapers/utils.py)
@@ -11,7 +11,7 @@ import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
- from ..jobs import CompensationInterval, JobType
+ from jobspy.model import CompensationInterval, JobType, Site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -47,11 +47,12 @@ class RotatingProxySession:
"""Utility method to format a proxy string into a dictionary."""
if proxy.startswith("http://") or proxy.startswith("https://"):
return {"http": proxy, "https": proxy}
+ if proxy.startswith("socks5://"):
+ return {"http": proxy, "https": proxy}
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
class RequestsRotating(RotatingProxySession, requests.Session):
def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
RotatingProxySession.__init__(self, proxies=proxies)
requests.Session.__init__(self)
@@ -86,7 +87,6 @@ class RequestsRotating(RotatingProxySession, requests.Session):
class TLSRotating(RotatingProxySession, tls_client.Session):
def __init__(self, proxies=None):
RotatingProxySession.__init__(self, proxies=proxies)
tls_client.Session.__init__(self, random_tls_extension_order=True)
@@ -286,3 +286,69 @@ def extract_job_type(description: str):
listing_types.append(key)
return listing_types if listing_types else None
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return job_type
raise Exception(f"Invalid job type: {value_str}")
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
job_data["max_amount"] *= 2080
if job_data["interval"] == "monthly":
job_data["min_amount"] *= 12
job_data["max_amount"] *= 12
if job_data["interval"] == "weekly":
job_data["min_amount"] *= 52
job_data["max_amount"] *= 52
if job_data["interval"] == "daily":
job_data["min_amount"] *= 260
job_data["max_amount"] *= 260
job_data["interval"] = "yearly"
desired_order = [
"id",
"site",
"job_url",
"job_url_direct",
"title",
"company",
"location",
"date_posted",
"job_type",
"salary_source",
"interval",
"min_amount",
"max_amount",
"currency",
"is_remote",
"job_level",
"job_function",
"listing_type",
"emails",
"description",
"company_industry",
"company_url",
"company_logo",
"company_url_direct",
"company_addresses",
"company_num_employees",
"company_revenue",
"company_description",
# naukri-specific fields
"skills",
"experience_range",
"company_rating",
"company_reviews_count",
"vacancy_count",
"work_from_home_type",
]
jobspy/ziprecruiter/__init__.py (moved from src/jobspy/scrapers/ziprecruiter/__init__.py)
@@ -1,10 +1,3 @@
"""
jobspy.scrapers.ziprecruiter
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape ZipRecruiter.
"""
from __future__ import annotations
import json
@@ -13,38 +6,39 @@ import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
- from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
- from .constants import headers
- from .. import Scraper, ScraperInput, Site
- from ..utils import (
+ from jobspy.ziprecruiter.constant import headers, get_cookie_data
+ from jobspy.util import (
extract_emails_from_text,
create_session,
markdown_converter,
remove_attributes,
create_logger,
)
- from ...jobs import (
+ from jobspy.model import (
JobPost,
Compensation,
Location,
JobResponse,
- JobType,
Country,
DescriptionFormat,
+ Scraper,
+ ScraperInput,
+ Site,
)
+ from jobspy.ziprecruiter.util import get_job_type_enum, add_params
log = create_logger("ZipRecruiter")
- class ZipRecruiterScraper(Scraper):
+ class ZipRecruiter(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
def __init__(
- self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+ self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
@@ -90,7 +84,7 @@ class ZipRecruiterScraper(Scraper):
def _find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None
- ) -> Tuple[list[JobPost], Optional[str]]:
+ ) -> tuple[list[JobPost], str | None]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
@@ -98,7 +92,7 @@ class ZipRecruiterScraper(Scraper):
:return: jobs found on page
"""
jobs_list = []
- params = self._add_params(scraper_input)
+ params = add_params(scraper_input)
if continue_token:
params["continue_from"] = continue_token
try:
@@ -151,7 +145,7 @@ class ZipRecruiterScraper(Scraper):
location = Location(
city=job.get("job_city"), state=job.get("job_state"), country=country_enum
)
- job_type = self._get_job_type_enum(
+ job_type = get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower()
)
date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
@@ -200,13 +194,17 @@ class ZipRecruiterScraper(Scraper):
else "" else ""
) )
description_full = job_description_clean + company_description_clean description_full = job_description_clean + company_description_clean
script_tag = soup.find("script", type="application/json")
if script_tag: try:
job_json = json.loads(script_tag.string) script_tag = soup.find("script", type="application/json")
job_url_val = job_json["model"].get("saveJobURL", "") if script_tag:
m = re.search(r"job_url=(.+)", job_url_val) job_json = json.loads(script_tag.string)
if m: job_url_val = job_json["model"].get("saveJobURL", "")
job_url_direct = m.group(1) m = re.search(r"job_url=(.+)", job_url_val)
if m:
job_url_direct = m.group(1)
except:
job_url_direct = None
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description_full = markdown_converter(description_full) description_full = markdown_converter(description_full)
@@ -217,51 +215,5 @@ class ZipRecruiterScraper(Scraper):
""" """
Sends a session event to the API with device properties. Sends a session event to the API with device properties.
""" """
data = [
("event_type", "session"),
("logged_in", "false"),
("number_of_retry", "1"),
("property", "model:iPhone"),
("property", "os:iOS"),
("property", "locale:en_us"),
("property", "app_build_number:4734"),
("property", "app_version:91.0"),
("property", "manufacturer:Apple"),
("property", "timestamp:2025-01-12T12:04:42-06:00"),
("property", "screen_height:852"),
("property", "os_version:16.6.1"),
("property", "source:install"),
("property", "screen_width:393"),
("property", "device_model:iPhone 14 Pro"),
("property", "brand:Apple"),
]
url = f"{self.api_url}/jobs-app/event" url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data) self.session.post(url, data=get_cookie_data)
@staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
@staticmethod
def _add_params(scraper_input) -> dict[str, str | Any]:
params = {
"search": scraper_input.search_term,
"location": scraper_input.location,
}
if scraper_input.hours_old:
params["days"] = max(scraper_input.hours_old // 24, 1)
job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
if scraper_input.job_type:
job_type = scraper_input.job_type
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply:
params["zipapply"] = 1
if scraper_input.is_remote:
params["remote"] = 1
if scraper_input.distance:
params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None}
jobspy/ziprecruiter/constant.py (new file, +29)
@@ -0,0 +1,29 @@
headers = {
"Host": "api.ziprecruiter.com",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
}
get_cookie_data = [
("event_type", "session"),
("logged_in", "false"),
("number_of_retry", "1"),
("property", "model:iPhone"),
("property", "os:iOS"),
("property", "locale:en_us"),
("property", "app_build_number:4734"),
("property", "app_version:91.0"),
("property", "manufacturer:Apple"),
("property", "timestamp:2025-01-12T12:04:42-06:00"),
("property", "screen_height:852"),
("property", "os_version:16.6.1"),
("property", "source:install"),
("property", "screen_width:393"),
("property", "device_model:iPhone 14 Pro"),
("property", "brand:Apple"),
]
jobspy/ziprecruiter/util.py (new file, +31)
@@ -0,0 +1,31 @@
from jobspy.model import JobType
def add_params(scraper_input) -> dict[str, str | int]:
params: dict[str, str | int] = {
"search": scraper_input.search_term,
"location": scraper_input.location,
}
if scraper_input.hours_old:
params["days"] = max(scraper_input.hours_old // 24, 1)
job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
if scraper_input.job_type:
job_type = scraper_input.job_type
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply:
params["zipapply"] = 1
if scraper_input.is_remote:
params["remote"] = 1
if scraper_input.distance:
params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None}
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
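A sketch of the query these helpers produce for a remote, full-time search (values illustrative):

from jobspy.model import JobType, ScraperInput, Site
from jobspy.ziprecruiter.util import add_params

params = add_params(
    ScraperInput(
        site_type=[Site.ZIP_RECRUITER],
        search_term="nurse",
        location="Dallas, TX",
        is_remote=True,
        job_type=JobType.FULL_TIME,
        distance=25,
        hours_old=48,
    )
)
# {"search": "nurse", "location": "Dallas, TX", "days": 2,
#  "employment_type": "full_time", "remote": 1, "radius": 25}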
pyproject.toml
@@ -4,15 +4,14 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
- version = "1.1.76"
+ version = "1.1.82"
- description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
+ description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
- authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
+ authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
- homepage = "https://github.com/Bunsly/JobSpy"
+ homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md"
- keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
+ keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt", "naukri"]
[[tool.poetry.packages]]
include = "jobspy"
- from = "src"
[tool.black]
line-length = 88
@@ -29,7 +28,6 @@ markdownify = "^0.13.1"
regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
- pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "*"
pre-commit = "*"
src/jobspy/scrapers/__init__.py (deleted)
@@ -1,58 +0,0 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat,
)
class Site(Enum):
LINKEDIN = "linkedin"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
BAYT = "bayt"
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None
location: str | None = None
country: Country | None = Country.USA
distance: int | None = None
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
results_wanted: int = 15
hours_old: int | None = None
class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
src/jobspy/scrapers/ziprecruiter/constants.py (deleted)
@@ -1,10 +0,0 @@
headers = {
"Host": "api.ziprecruiter.com",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
}