mirror of https://github.com/Bunsly/JobSpy
Bdjobs Fixed
parent: 94d413bad1
commit: cb6ea53b7c
@@ -6,6 +6,7 @@ from typing import Tuple
import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.bdjobs import BDJobs
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
@@ -25,6 +26,8 @@ from jobspy.util import (
from jobspy.ziprecruiter import ZipRecruiter


# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function

def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
@@ -59,6 +62,7 @@ def scrape_jobs(
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
        Site.NAUKRI: Naukri,
        Site.BDJOBS: BDJobs,  # Add BDJobs to the scraper mapping
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None
@@ -213,3 +217,9 @@ def scrape_jobs(
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()


# Add BDJobs to __all__
__all__ = [
    "BDJobs",
]
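Reviewer note: with Site.BDJOBS registered in the scraper mapping and BDJobs exported via __all__, the new board is reachable through the existing scrape_jobs entry point. A minimal usage sketch under that assumption (the search term and result count are illustrative; the returned DataFrame schema is unchanged by this commit):

    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name="bdjobs",              # resolved to Site.BDJOBS, then to the BDJobs scraper
        search_term="python developer",  # forwarded to params["txtsearch"] in BDJobs.scrape
        results_wanted=10,
    )
    print(len(jobs))   # pandas DataFrame row count, at most results_wanted
    print(jobs.head())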
@@ -0,0 +1,279 @@
#__init__.py
from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")

class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes the BDJobs scraper with the BDJobs job search URL
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"
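Reviewer note: the delay and band_delay class attributes above feed the jittered sleep between search pages in scrape() below, so with the defaults each page request is spaced roughly 2 to 5 seconds apart. A small illustration of that spacing, using the same expression as the scraper:

    import random
    import time

    delay, band_delay = 2, 3                          # the BDJobs class defaults
    pause = random.uniform(delay, delay + band_delay)  # somewhere in [2.0, 5.0] seconds
    print(f"sleeping {pause:.1f}s before the next page")
    time.sleep(pause)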
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted

        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")

            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, 'request_timeout', 60)
                )

                # DEBUG: Save the received HTML content
                try:
                    with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
                        f.write(response.text)
                    log.info("Saved scraper response to scraper_received_bdjobs.html")
                except Exception as e_write:
                    log.error(f"Error writing debug HTML file: {e_write}")

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards or len(job_cards) == 0:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)

                            if not continue_search():
                                break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")

                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[:scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
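Reviewer note: the scrape() loop can also be exercised without going through scrape_jobs by constructing a ScraperInput directly. A sketch under the assumption that ScraperInput accepts the fields referenced in this diff (search_term and results_wanted appear above; the site_type field name is an assumption based on the surrounding JobSpy models):

    from jobspy.bdjobs import BDJobs
    from jobspy.model import ScraperInput, Site

    scraper = BDJobs()
    scraper_input = ScraperInput(
        site_type=[Site.BDJOBS],   # assumed field name for the requested sites
        search_term="accountant",
        results_wanted=5,
    )
    result = scraper.scrape(scraper_input)
    print(len(result.jobs))        # JobResponse.jobs, capped at results_wanted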
    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name - IMPROVED
            company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]))
                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"

            # Extract location
            location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower())
            if not location_elem:
                location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"]))
            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type", "")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None
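Reviewer note: _process_job leans heavily on BeautifulSoup's callable class_ filters, where the lambda receives a tag's class value and a truthy return selects the tag. A self-contained sketch of that idiom against a made-up card fragment (the class names mirror the selectors used above):

    from bs4 import BeautifulSoup

    html = '<div class="norm-jobs-wrapper"><div class="comp-name-text">Acme Ltd</div></div>'
    card = BeautifulSoup(html, "html.parser")
    company = card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
    print(company.get_text(strip=True))  # Acme Ltd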
    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")

            # Find job description - IMPROVED based on correct.py
            description = ""

            # Try to find the job content div first (as in correct.py)
            job_content_div = soup.find('div', class_='jobcontent')
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower())
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ['hr', 'h4', 'h5']:
                            break
                        if sibling.name == 'ul':
                            responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li'))
                        elif sibling.name == 'p':
                            responsibilities_elements.append(sibling.get_text(separator=' ', strip=True))

                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""

            # If no description found yet, try the original approach
            if not description:
                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
            job_type = None
            if job_type_elem:
                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True)
                job_type = job_type_text if job_type_text else None

            # Extract company industry
            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
            company_industry = None
            if industry_elem:
                industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True)
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry
            }

        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
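Reviewer note: the responsibilities extraction above walks the siblings of the h4#job_resp heading until it reaches the next heading or horizontal rule. A self-contained sketch of that traversal on a made-up fragment of a detail page:

    from bs4 import BeautifulSoup

    html = """
    <div class="jobcontent">
      <h4 id="job_resp">Job Responsibilities</h4>
      <ul><li>Prepare monthly reports</li><li>Coordinate with vendors</li></ul>
      <hr/>
      <h4>Educational Requirements</h4>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    heading = soup.find("h4", id="job_resp")
    items = []
    for sibling in heading.find_next_siblings():
        if sibling.name in ("hr", "h4", "h5"):
            break                    # stop at the next section
        if sibling.name == "ul":
            items.extend(li.get_text(strip=True) for li in sibling.find_all("li"))
    print("\n".join(items))          # Prepare monthly reports / Coordinate with vendors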
@@ -0,0 +1,32 @@
#constant.py
# Headers for BDJobs requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings
job_selectors = [
    "div.job-item",           # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",      # Catches featured job listings
]

# Date formats used by BDJobs
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]
@@ -0,0 +1,100 @@
#util.py
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional, List, Dict, Any

from jobspy.model import Location, Country


def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Parses location text into a Location object
    :param location_text: Location text from job listing
    :param country: Default country
    :return: Location object
    """
    parts = location_text.split(",")
    if len(parts) >= 2:
        city = parts[0].strip()
        state = parts[1].strip()
        return Location(
            city=city,
            state=state,
            country=Country.from_string(country)
        )
    else:
        return Location(
            city=location_text.strip(),
            country=Country.from_string(country)
        )
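Reviewer note: for two-part strings the first comma-separated piece becomes the city and the second the state; anything else is treated as a city only. A behavior sketch (attribute access assumes the Location kwargs used above map to fields of the same name):

    loc = parse_location("Dhaka, Bangladesh")
    print(loc.city, loc.state)   # Dhaka Bangladesh

    loc = parse_location("Chattogram")
    print(loc.city)              # Chattogram, with no state set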
def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object
    :param date_text: Date text from job listing
    :return: datetime object or None if parsing fails
    """
    from .constant import date_formats

    try:
        # Clean up date text
        if "Deadline:" in date_text:
            date_text = date_text.replace("Deadline:", "").strip()

        # Try different date formats
        for fmt in date_formats:
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue

        return None
    except Exception:
        return None
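Reviewer note: the function strips a leading "Deadline:" label and then tries each entry in date_formats until one parses. A quick behavior sketch:

    print(parse_date("Deadline: 15 Mar 2025"))  # datetime(2025, 3, 15), matched by "%d %b %Y"
    print(parse_date("15-Mar-2025"))            # datetime(2025, 3, 15), matched by "%d-%b-%Y"
    print(parse_date("sometime soon"))          # None, no format matches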
def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Finds job listing elements in the HTML
    :param soup: BeautifulSoup object
    :return: List of job card elements
    """
    from .constant import job_selectors

    # Try different selectors
    for selector in job_selectors:
        if "." in selector:
            tag_name, class_name = selector.split(".", 1)
            elements = soup.find_all(tag_name, class_=class_name)
            if elements and len(elements) > 0:
                return elements

    # If no selectors match, look for job detail links
    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if job_links:
        # Return parent elements of job links
        return [link.parent for link in job_links]

    return []
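Reviewer note: each selector is split on the first dot into a tag name and a class, and the first selector that matches anything wins; otherwise the function falls back to the parents of any jobdetail links. A behavior sketch on a tiny page (assuming the jobspy.bdjobs package from this diff is importable):

    from bs4 import BeautifulSoup
    from jobspy.bdjobs.util import find_job_listings

    html = '<div class="norm-jobs-wrapper"><a href="jobdetails.asp?id=1">Job A</a></div>'
    cards = find_job_listings(BeautifulSoup(html, "html.parser"))
    print(len(cards))  # 1, matched by the "div.norm-jobs-wrapper" selector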
def is_job_remote(title: str, description: str = None, location: Location = None) -> bool:
    """
    Determines if a job is remote based on title, description, and location
    :param title: Job title
    :param description: Job description
    :param location: Job location
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]

    # Combine all text fields
    full_text = title.lower()
    if description:
        full_text += " " + description.lower()
    if location:
        full_text += " " + location.display_location().lower()

    # Check for remote keywords
    return any(keyword in full_text for keyword in remote_keywords)
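Reviewer note: the check is a simple keyword scan over whatever text is supplied. Behavior sketch:

    print(is_job_remote("Remote Python Engineer"))                # True, "remote" appears in the title
    print(is_job_remote("Accountant", "Work from home allowed"))  # True, keyword in the description
    print(is_job_remote("Accountant"))                            # False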
@@ -38,3 +38,8 @@ class BaytException(Exception):
class NaukriException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Naukri")


class BDJobsException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with BDJobs")
@@ -68,6 +68,7 @@ class Country(Enum):
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BANGLADESH = ("bangladesh", "bd")  # Added Bangladesh
    BELGIUM = ("belgium", "be", "fr:be")
    BULGARIA = ("bulgaria", "bg")
    BRAZIL = ("brazil", "br", "com.br")
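Reviewer note: this is the member that Country.from_string("bangladesh") in the new jobspy/bdjobs/util.py is expected to resolve; "bd" is the country-code half of the tuple. A sketch of that lookup (from_string is the resolver the new util module already calls; its exact matching and error behavior are not shown in this diff):

    from jobspy.model import Country

    country = Country.from_string("bangladesh")
    print(country is Country.BANGLADESH)  # True, assuming from_string matches on the name string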
@@ -291,6 +292,7 @@ class Site(Enum):
    GOOGLE = "google"
    BAYT = "bayt"
    NAUKRI = "naukri"
    BDJOBS = "bdjobs"  # Add this line


class SalarySource(Enum):
@@ -314,6 +316,8 @@ class ScraperInput(BaseModel):
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    request_timeout: int = 60

    results_wanted: int = 15
    hours_old: int | None = None
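Reviewer note: the new request_timeout field is read in BDJobs.scrape through getattr(scraper_input, 'request_timeout', 60), so input objects created before this change still fall back to 60 seconds. A minimal sketch of that fallback (_LegacyInput is hypothetical, standing in for an older input object without the field):

    class _LegacyInput:
        pass  # hypothetical pre-change input with no request_timeout attribute

    timeout = getattr(_LegacyInput(), "request_timeout", 60)
    print(timeout)  # 60: a missing attribute falls back to the default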