Compare commits


13 Commits

Author SHA1 Message Date
Berkay Gemici
fda080a373 fix(linkedin): add fallback for date parsing on new job listings (#343)
LinkedIn uses two CSS classes for job posting dates:
- `job-search-card__listdate` for older posts
- `job-search-card__listdate--new` for recent posts (< 24h)

The scraper only checked the first class, causing `date_posted` to be
None for all fresh listings. This adds a fallback to also check for
the `--new` variant.
2026-02-18 13:39:52 -06:00
Sean
6e7ab6ff74 Fix: re Issue #295 (@krishianjan): added (seemingly missing) user_agent keyword argument to BDJobs 2026-01-09 23:28:27 -06:00
kj55-dev
7160d0faed fix: relax numpy version constraint to >=1.26.0 (#337) 2026-01-09 23:27:54 -06:00
Cullen Watson
6e014cf732 chore: codeowners 2025-08-23 22:42:45 +02:00
Kaushik H S
6e8576f8a8 fix(naukri): prevent str.find error by normalizing input and parsing before Markdown (#300) 2025-08-23 15:38:26 -05:00
Alexander Smirnov
51888004b7 Update __init__.py (#296)
pagination fix: start update with job_cards instead of job_list
2025-08-23 15:38:02 -05:00
Lixian Wang
b6d5cd8d79 fix:correct LinkedIn logger naming (#291)
* fix:correct LinkedIn logger naming

* add:linkedin description plain format
2025-08-23 15:37:49 -05:00
ZuoyunZheng
84ed670df3 chore: bump markdownify from 0.13.1 to 1.1.0 (#290) 2025-08-23 15:37:34 -05:00
Cullen Watson
4b16ac7967 chore:readme 2025-07-28 17:19:56 +02:00
itsShrizon
ae2b1ea42c Bdjobs Fixed (#280) 2025-07-28 10:05:10 -05:00
Cullen Watson
53b3b41385 fix: glassdoor ua 2025-07-28 16:55:51 +02:00
Lê Trọng Tài
9aae02453d issue#270: glassdoor 403 response by rotating user-agent and updating headers (#274) 2025-07-28 09:55:05 -05:00
Piotr Geca
94d413bad1 support for socks5 proxies (#266)
Co-authored-by: Piotr Geca <piotr.geca@npl.co.uk>
2025-04-10 15:53:28 -05:00
20 changed files with 569 additions and 82 deletions

.github/CODEOWNERS vendored Normal file

@@ -0,0 +1 @@
+* @cullenwatson

README.md

@@ -4,7 +4,7 @@
 ## Features
-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & other job boards concurrently
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking
@@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs
 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "google"],  # "glassdoor", "bayt", "naukri", "bdjobs"
     search_term="software engineer",
     google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
@@ -59,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt
+|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt, bdjobs
 |    (default is all)
 ├── search_term (str)
@@ -86,6 +86,9 @@ Optional
 ├── easy_apply (bool):
 |    filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
 |
+├── user_agent (str):
+|    override the default user agent which may be outdated
 ├── description_format (str):
 |    markdown, html (Format type of the job descriptions. Default is markdown.)

jobspy/__init__.py

@@ -6,6 +6,7 @@ from typing import Tuple
 import pandas as pd
 from jobspy.bayt import BaytScraper
+from jobspy.bdjobs import BDJobs
 from jobspy.glassdoor import Glassdoor
 from jobspy.google import Google
 from jobspy.indeed import Indeed
@@ -25,6 +26,8 @@ from jobspy.util import (
 from jobspy.ziprecruiter import ZipRecruiter
+# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
@@ -45,6 +48,7 @@
     hours_old: int = None,
     enforce_annual_salary: bool = False,
     verbose: int = 0,
+    user_agent: str = None,
     **kwargs,
 ) -> pd.DataFrame:
     """
@@ -59,6 +63,7 @@
         Site.GOOGLE: Google,
         Site.BAYT: BaytScraper,
         Site.NAUKRI: Naukri,
+        Site.BDJOBS: BDJobs,  # Add BDJobs to the scraper mapping
     }
     set_logger_level(verbose)
     job_type = get_enum_from_value(job_type) if job_type else None
@@ -98,10 +103,11 @@
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
+        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
         site_name = "LinkedIn" if cap_name == "Linkedin" else cap_name
         create_logger(site_name).info(f"finished scraping")
         return site.value, scraped_data
@@ -212,4 +218,10 @@
             by=["site", "date_posted"], ascending=[True, False]
         ).reset_index(drop=True)
     else:
-        return pd.DataFrame()
+        return pd.DataFrame()
+
+
+# Add BDJobs to __all__
+__all__ = [
+    "BDJobs",
+]
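
Taken together, the hunks above register BDJobs in `SCRAPER_MAPPING` and thread the new `user_agent` keyword from `scrape_jobs` down to every scraper. A minimal usage sketch (parameter values are illustrative only, not recommended settings):

```python
from jobspy import scrape_jobs

# user_agent is the new pass-through parameter added in this compare;
# the string below is just an example, not a recommended agent.
jobs = scrape_jobs(
    site_name=["bdjobs", "glassdoor"],
    search_term="software engineer",
    results_wanted=10,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
)
print(len(jobs), "jobs")
print(jobs.head())
```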

jobspy/bayt/__init__.py

@@ -25,7 +25,7 @@ class BaytScraper(Scraper):
     band_delay = 3
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None

jobspy/bdjobs/__init__.py Normal file

@@ -0,0 +1,353 @@
# __init__.py
from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import (
    parse_location,
    parse_date,
    find_job_listings,
    is_job_remote,
)
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")


class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0
        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term
        continue_search = lambda: len(job_list) < scraper_input.results_wanted
        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")
            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page
                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )
                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break
                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)
                if not job_cards or len(job_cards) == 0:
                    log.info("No more job listings found")
                    break
                log.info(f"Found {len(job_cards)} job cards on page {page}")
                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)
                        if not continue_search():
                            break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")
                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break
        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None
            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)
            # Extract job ID from URL
            job_id = (
                job_url.split("jobid=")[-1].split("&")[0]
                if "jobid=" in job_url
                else f"bdjobs-{hash(job_url)}"
            )
            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(
                    ["h2", "h3", "h4", "strong", "div"],
                    class_=lambda c: c and "job-title-text" in c,
                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"
            # Extract company name - IMPROVED
            company_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "comp-name-text" in (c or "").lower(),
            )
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["company", "org", "comp-name"]
                    ),
                )
                company_name = (
                    company_elem.get_text(strip=True) if company_elem else "N/A"
                )
            # Extract location
            location_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "locon-text-d" in (c or "").lower(),
            )
            if not location_elem:
                location_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["location", "area", "locon"]
                    ),
                )
            location_text = (
                location_elem.get_text(strip=True)
                if location_elem
                else "Dhaka, Bangladesh"
            )
            # Create Location object
            location = parse_location(location_text, self.country)
            # Extract date posted
            date_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c
                and any(
                    term in (c or "").lower()
                    for term in ["date", "deadline", "published"]
                ),
            )
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)
            # Check if job is remote
            is_remote = is_job_remote(title, location=location)
            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )
            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type", "")
            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, "html.parser")
            # Find job description - IMPROVED based on correct.py
            description = ""
            # Try to find the job content div first (as in correct.py)
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find(
                    "h4", id="job_resp"
                ) or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True)
                                for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(
                                sibling.get_text(separator=" ", strip=True)
                            )
                    description = (
                        "\n".join(responsibilities_elements)
                        if responsibilities_elements
                        else ""
                    )
            # If no description found yet, try the original approach
            if not description:
                description_elem = soup.find(
                    ["div", "section"],
                    class_=lambda c: c
                    and any(
                        term in (c or "").lower()
                        for term in ["job-description", "details", "requirements"]
                    ),
                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if (
                        hasattr(self.scraper_input, "description_format")
                        and self.scraper_input.description_format
                        == DescriptionFormat.MARKDOWN
                    ):
                        description = markdown_converter(description)
            # Extract job type
            job_type_elem = soup.find(
                ["span", "div"],
                string=lambda s: s
                and any(
                    term in (s or "").lower()
                    for term in ["job type", "employment type"]
                ),
            )
            job_type = None
            if job_type_elem:
                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                job_type = job_type_text if job_type_text else None
            # Extract company industry
            industry_elem = soup.find(
                ["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
            )
            company_industry = None
            if industry_elem:
                industry_text = industry_elem.find_next(["span", "div"]).get_text(
                    strip=True
                )
                company_industry = industry_text if industry_text else None
            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }
        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
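
For direct use of the new scraper without going through `scrape_jobs`, something like the following should work (a sketch assuming `ScraperInput`'s site list field is `site_type`, as in upstream JobSpy):

```python
from jobspy.bdjobs import BDJobs
from jobspy.model import ScraperInput, Site

scraper = BDJobs()
# site_type / search_term / results_wanted per the ScraperInput model shown later in this compare
result = scraper.scrape(ScraperInput(site_type=[Site.BDJOBS], search_term="python", results_wanted=5))
for job in result.jobs:
    print(job.title, "-", job.job_url)
```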

jobspy/bdjobs/constant.py Normal file

@@ -0,0 +1,32 @@
# constant.py

# Headers for BDJobs requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings
job_selectors = [
    "div.job-item",  # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",  # Catches featured job listings
]

# Date formats used by BDJobs
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]

jobspy/bdjobs/util.py Normal file

@@ -0,0 +1,100 @@
# util.py
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional, List, Dict, Any

from jobspy.model import Location, Country


def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Parses location text into a Location object
    :param location_text: Location text from job listing
    :param country: Default country
    :return: Location object
    """
    parts = location_text.split(",")
    if len(parts) >= 2:
        city = parts[0].strip()
        state = parts[1].strip()
        return Location(
            city=city,
            state=state,
            country=Country.from_string(country)
        )
    else:
        return Location(
            city=location_text.strip(),
            country=Country.from_string(country)
        )


def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object
    :param date_text: Date text from job listing
    :return: datetime object or None if parsing fails
    """
    from .constant import date_formats

    try:
        # Clean up date text
        if "Deadline:" in date_text:
            date_text = date_text.replace("Deadline:", "").strip()
        # Try different date formats
        for fmt in date_formats:
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue
        return None
    except Exception:
        return None


def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Finds job listing elements in the HTML
    :param soup: BeautifulSoup object
    :return: List of job card elements
    """
    from .constant import job_selectors

    # Try different selectors
    for selector in job_selectors:
        if "." in selector:
            tag_name, class_name = selector.split(".", 1)
            elements = soup.find_all(tag_name, class_=class_name)
            if elements and len(elements) > 0:
                return elements
    # If no selectors match, look for job detail links
    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if job_links:
        # Return parent elements of job links
        return [link.parent for link in job_links]
    return []


def is_job_remote(title: str, description: str = None, location: Location = None) -> bool:
    """
    Determines if a job is remote based on title, description, and location
    :param title: Job title
    :param description: Job description
    :param location: Job location
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]
    # Combine all text fields
    full_text = title.lower()
    if description:
        full_text += " " + description.lower()
    if location:
        full_text += " " + location.display_location().lower()
    # Check for remote keywords
    return any(keyword in full_text for keyword in remote_keywords)
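
A quick sanity sketch of these helpers (assuming the module lands at `jobspy.bdjobs.util`, as the imports in `jobspy/bdjobs/__init__.py` above indicate):

```python
from jobspy.bdjobs.util import parse_location, parse_date, is_job_remote

loc = parse_location("Dhaka, Bangladesh")          # Location(city="Dhaka", state="Bangladesh", ...)
deadline = parse_date("Deadline: 25 Dec 2025")     # "Deadline:" is stripped, then "%d %b %Y" matches
print(deadline)                                    # 2025-12-25 00:00:00
print(is_job_remote("Senior Engineer (Remote)"))   # True
```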

jobspy/exception.py

@@ -37,4 +37,9 @@ class BaytException(Exception):
 class NaukriException(Exception):
     def __init__(self,message=None):
-        super().__init__(message or "An error occurred with Naukri")
+        super().__init__(message or "An error occurred with Naukri")
+
+
+class BDJobsException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with BDJobs")

jobspy/glassdoor/__init__.py

@@ -34,13 +34,13 @@ log = create_logger("Glassdoor")
class Glassdoor(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.GLASSDOOR)
super().__init__(site, proxies=proxies, ca_cert=ca_cert)
super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
self.base_url = None
self.country = None
@@ -65,6 +65,8 @@ class Glassdoor(Scraper):
)
token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token
if self.user_agent:
headers["user-agent"] = self.user_agent
self.session.headers.update(headers)
location_id, location_type = self._get_location(

jobspy/glassdoor/constant.py

@@ -13,7 +13,7 @@ headers = {
     "sec-fetch-dest": "empty",
     "sec-fetch-mode": "cors",
     "sec-fetch-site": "same-origin",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
 }
 query_template = """
 query JobSearchResultsQuery(

jobspy/google/__init__.py

@@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info
 class Google(Scraper):
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes Google Scraper with the Goodle jobs search url

jobspy/indeed/__init__.py

@@ -28,7 +28,7 @@ log = create_logger("Indeed")
class Indeed(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
):
"""
Initializes IndeedScraper with the Indeed API url

jobspy/linkedin/__init__.py

@@ -35,6 +35,7 @@ from jobspy.util import (
     extract_emails_from_text,
     currency_parser,
     markdown_converter,
+    plain_converter,
     create_session,
     remove_attributes,
     create_logger,
@@ -50,7 +51,7 @@ class LinkedIn(Scraper):
     jobs_per_page = 25
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
@@ -164,7 +165,7 @@ class LinkedIn(Scraper):
             if continue_search():
                 time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                start += len(job_list)
+                start += len(job_cards)
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
@@ -208,6 +209,10 @@
             if metadata_card
             else None
         )
+        if not datetime_tag and metadata_card:
+            datetime_tag = metadata_card.find(
+                "time", class_="job-search-card__listdate--new"
+            )
         date_posted = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
@@ -267,7 +272,8 @@
         description = div_content.prettify(formatter="html")
         if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
             description = markdown_converter(description)
+        elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
+            description = plain_converter(description)
         h3_tag = soup.find(
             "h3", text=lambda text: text and "Job function" in text.strip()
         )
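
The date fallback above (commit fda080a373) in action — a minimal sketch with illustrative markup; BeautifulSoup matches class tokens exactly, so `job-search-card__listdate` does not match the `--new` variant and the fallback is required:

```python
from bs4 import BeautifulSoup

# Illustrative job-card markup; real LinkedIn cards carry more attributes.
html = '<div><time class="job-search-card__listdate--new" datetime="2026-02-18">1h ago</time></div>'
metadata_card = BeautifulSoup(html, "html.parser").div

datetime_tag = metadata_card.find("time", class_="job-search-card__listdate")  # None for fresh posts
if not datetime_tag and metadata_card:
    datetime_tag = metadata_card.find("time", class_="job-search-card__listdate--new")
print(datetime_tag["datetime"])  # 2026-02-18
```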

jobspy/model.py

@@ -68,6 +68,7 @@ class Country(Enum):
     AUSTRALIA = ("australia", "au", "com.au")
     AUSTRIA = ("austria", "at", "at")
     BAHRAIN = ("bahrain", "bh")
+    BANGLADESH = ("bangladesh", "bd")  # Added Bangladesh
     BELGIUM = ("belgium", "be", "fr:be")
     BULGARIA = ("bulgaria", "bg")
     BRAZIL = ("brazil", "br", "com.br")
@@ -233,7 +234,7 @@ class Compensation(BaseModel):
 class DescriptionFormat(Enum):
     MARKDOWN = "markdown"
     HTML = "html"
+    PLAIN = "plain"

 class JobPost(BaseModel):
     id: str | None = None
@@ -291,6 +292,7 @@
     GOOGLE = "google"
     BAYT = "bayt"
     NAUKRI = "naukri"
+    BDJOBS = "bdjobs"  # Add this line

 class SalarySource(Enum):
@@ -314,17 +316,20 @@ class ScraperInput(BaseModel):
     linkedin_company_ids: list[int] | None = None
     description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
+    request_timeout: int = 60
     results_wanted: int = 15
     hours_old: int | None = None

 class Scraper(ABC):
     def __init__(
-        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
+        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         self.site = site
         self.proxies = proxies
         self.ca_cert = ca_cert
+        self.user_agent = user_agent

     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

jobspy/naukri/__init__.py

@@ -44,7 +44,7 @@ class Naukri(Scraper):
     jobs_per_page = 20
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes NaukriScraper with the Naukri API URL
@@ -164,12 +164,15 @@ class Naukri(Scraper):
         date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
         job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
-        description = job.get("jobDescription") if full_descr else None
+        raw_description = job.get("jobDescription") if full_descr else None
+        job_type = parse_job_type(raw_description) if raw_description else None
+        company_industry = parse_company_industry(raw_description) if raw_description else None
+        description = raw_description
         if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
             description = markdown_converter(description)
-        job_type = parse_job_type(description) if description else None
-        company_industry = parse_company_industry(description) if description else None
         is_remote = is_job_remote(title, description or "", location)
         company_logo = job.get("logoPathV3") or job.get("logoPath")

jobspy/naukri/util.py

@@ -5,10 +5,12 @@ from jobspy.model import JobType, Location
 from jobspy.util import get_enum_from_job_type

-def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
+def parse_job_type(soup: BeautifulSoup | str) -> list[JobType] | None:
     """
     Gets the job type from the job page
     """
+    if isinstance(soup, str):
+        soup = BeautifulSoup(soup, "html.parser")
     job_type_tag = soup.find("span", class_="job-type")
     if job_type_tag:
         job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
@@ -16,10 +18,12 @@ def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
     return None

-def parse_company_industry(soup: BeautifulSoup) -> str | None:
+def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
     """
     Gets the company industry from the job page
     """
+    if isinstance(soup, str):
+        soup = BeautifulSoup(soup, "html.parser")
     industry_tag = soup.find("span", class_="industry")
     return industry_tag.get_text(strip=True) if industry_tag else None
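
With the new `str` overload, the Naukri scraper can hand the raw `jobDescription` HTML straight to these parsers before Markdown conversion (the fix in commit 6e8576f8a8 above); a small sketch with illustrative markup:

```python
from jobspy.naukri.util import parse_job_type, parse_company_industry

raw_description = '<span class="job-type">Full-Time</span><span class="industry">IT Services</span>'
print(parse_job_type(raw_description))          # job type parsed from "fulltime" via get_enum_from_job_type
print(parse_company_industry(raw_description))  # "IT Services"
```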

jobspy/util.py

@@ -47,11 +47,12 @@ class RotatingProxySession:
         """Utility method to format a proxy string into a dictionary."""
         if proxy.startswith("http://") or proxy.startswith("https://"):
             return {"http": proxy, "https": proxy}
+        if proxy.startswith("socks5://"):
+            return {"http": proxy, "https": proxy}
         return {"http": f"http://{proxy}", "https": f"http://{proxy}"}

 class RequestsRotating(RotatingProxySession, requests.Session):
     def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
         RotatingProxySession.__init__(self, proxies=proxies)
         requests.Session.__init__(self)
@@ -86,7 +87,6 @@ class RequestsRotating(RotatingProxySession, requests.Session):

 class TLSRotating(RotatingProxySession, tls_client.Session):
     def __init__(self, proxies=None):
         RotatingProxySession.__init__(self, proxies=proxies)
         tls_client.Session.__init__(self, random_tls_extension_order=True)
@@ -157,6 +157,15 @@ def markdown_converter(description_html: str):
     markdown = md(description_html)
     return markdown.strip()

+def plain_converter(decription_html: str):
+    from bs4 import BeautifulSoup
+    if decription_html is None:
+        return None
+    soup = BeautifulSoup(decription_html, "html.parser")
+    text = soup.get_text(separator=" ")
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()

 def extract_emails_from_text(text: str) -> list[str] | None:
     if not text:
@@ -344,7 +353,7 @@ desired_order = [
     "company_num_employees",
     "company_revenue",
     "company_description",
-    #naukri-specific fields
+    # naukri-specific fields
     "skills",
     "experience_range",
     "company_rating",

jobspy/ziprecruiter/__init__.py

@@ -38,7 +38,7 @@ class ZipRecruiter(Scraper):
     api_url = "https://api.ziprecruiter.com"
     def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
     ):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url

poetry.lock generated

@@ -749,17 +749,6 @@ files = [
 [package.extras]
 all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
-[[package]]
-name = "iniconfig"
-version = "2.0.0"
-description = "brain-dead simple config-ini parsing"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
-    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
-]
 [[package]]
 name = "ipykernel"
 version = "6.29.5"
@@ -1229,13 +1218,13 @@ files = [
 [[package]]
 name = "markdownify"
-version = "0.13.1"
+version = "1.1.0"
 description = "Convert HTML to markdown."
 optional = false
 python-versions = "*"
 files = [
-    {file = "markdownify-0.13.1-py3-none-any.whl", hash = "sha256:1d181d43d20902bcc69d7be85b5316ed174d0dda72ff56e14ae4c95a4a407d22"},
-    {file = "markdownify-0.13.1.tar.gz", hash = "sha256:ab257f9e6bd4075118828a28c9d02f8a4bfeb7421f558834aa79b2dfeb32a098"},
+    {file = "markdownify-1.1.0-py3-none-any.whl", hash = "sha256:32a5a08e9af02c8a6528942224c91b933b4bd2c7d078f9012943776fc313eeef"},
+    {file = "markdownify-1.1.0.tar.gz", hash = "sha256:449c0bbbf1401c5112379619524f33b63490a8fa479456d41de9dc9e37560ebd"},
 ]
 [package.dependencies]
@@ -1710,21 +1699,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
 type = ["mypy (>=1.11.2)"]
-[[package]]
-name = "pluggy"
-version = "1.5.0"
-description = "plugin and hook calling mechanisms for python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
-    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
-]
-[package.extras]
-dev = ["pre-commit", "tox"]
-testing = ["pytest", "pytest-benchmark"]
 [[package]]
 name = "pre-commit"
 version = "4.0.1"
@@ -1975,28 +1949,6 @@ files = [
 [package.extras]
 windows-terminal = ["colorama (>=0.4.6)"]
-[[package]]
-name = "pytest"
-version = "7.4.4"
-description = "pytest: simple powerful testing with Python"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
-    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
-]
-[package.dependencies]
-colorama = {version = "*", markers = "sys_platform == \"win32\""}
-exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
-iniconfig = "*"
-packaging = "*"
-pluggy = ">=0.12,<2.0"
-tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
-[package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -2869,4 +2821,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "57169347d2ce0ff19c4d3024ce000651bb3a816e36f454618f480741094fb4a7"
+content-hash = "6260adc8f96f6cf1ba4e2c23f05504c19e67140b9d346aed3d12eea6957b2104"

pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.80"
+version = "1.1.82"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
 authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
 homepage = "https://github.com/cullenwatson/JobSpy"
@@ -21,10 +21,10 @@ python = "^3.10"
 requests = "^2.31.0"
 beautifulsoup4 = "^4.12.2"
 pandas = "^2.1.0"
-numpy = "1.26.3"
+numpy = ">=1.26.0"
 pydantic = "^2.3.0"
 tls-client = "^1.0.1"
-markdownify = "^0.13.1"
+markdownify = "^1.1.0"
 regex = "^2024.4.28"
 [tool.poetry.group.dev.dependencies]