mirror of https://github.com/Bunsly/JobSpy
Bdjobs Fixed
parent: 94d413bad1
commit: cb6ea53b7c
@@ -6,6 +6,7 @@ from typing import Tuple
import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.bdjobs import BDJobs
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
@@ -25,6 +26,8 @@ from jobspy.util import (
from jobspy.ziprecruiter import ZipRecruiter


# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function

def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
@@ -59,6 +62,7 @@ def scrape_jobs(
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
        Site.NAUKRI: Naukri,
        Site.BDJOBS: BDJobs,  # Add BDJobs to the scraper mapping
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None
@@ -213,3 +217,9 @@ def scrape_jobs(
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()


# Add BDJobs to __all__
__all__ = [
    "BDJobs",
]
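Reviewer note: with Site.BDJOBS registered in the scraper mapping and BDJobs exported via __all__, the new board is reachable through the existing scrape_jobs entry point. A minimal usage sketch under that assumption (the search term and result count are illustrative; the returned DataFrame schema is unchanged by this commit):

    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name="bdjobs",              # resolved to Site.BDJOBS, then to the BDJobs scraper
        search_term="python developer",  # forwarded to params["txtsearch"] in BDJobs.scrape
        results_wanted=10,
    )
    print(len(jobs))   # pandas DataFrame row count, at most results_wanted
    print(jobs.head())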
@@ -0,0 +1,279 @@
#__init__.py
from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")

class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes the BDJobs scraper with the BDJobs job search URL
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"
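Reviewer note: the delay and band_delay class attributes above feed the jittered sleep between search pages in scrape() below, so with the defaults each page request is spaced roughly 2 to 5 seconds apart. A small illustration of that spacing, using the same expression as the scraper:

    import random
    import time

    delay, band_delay = 2, 3                          # the BDJobs class defaults
    pause = random.uniform(delay, delay + band_delay)  # somewhere in [2.0, 5.0] seconds
    print(f"sleeping {pause:.1f}s before the next page")
    time.sleep(pause)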
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted

        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")

            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, 'request_timeout', 60)
                )

                # DEBUG: Save the received HTML content
                try:
                    with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
                        f.write(response.text)
                    log.info("Saved scraper response to scraper_received_bdjobs.html")
                except Exception as e_write:
                    log.error(f"Error writing debug HTML file: {e_write}")

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards or len(job_cards) == 0:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)

                            if not continue_search():
                                break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")

                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[:scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
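Reviewer note: the scrape() loop can also be exercised without going through scrape_jobs by constructing a ScraperInput directly. A sketch under the assumption that ScraperInput accepts the fields referenced in this diff (search_term and results_wanted appear above; the site_type field name is an assumption based on the surrounding JobSpy models):

    from jobspy.bdjobs import BDJobs
    from jobspy.model import ScraperInput, Site

    scraper = BDJobs()
    scraper_input = ScraperInput(
        site_type=[Site.BDJOBS],   # assumed field name for the requested sites
        search_term="accountant",
        results_wanted=5,
    )
    result = scraper.scrape(scraper_input)
    print(len(result.jobs))        # JobResponse.jobs, capped at results_wanted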
    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name - IMPROVED
            company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]))
                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"

            # Extract location
            location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower())
            if not location_elem:
                location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"]))
            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type", "")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None
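Reviewer note: _process_job leans heavily on BeautifulSoup's callable class_ filters, where the lambda receives a tag's class value and a truthy return selects the tag. A self-contained sketch of that idiom against a made-up card fragment (the class names mirror the selectors used above):

    from bs4 import BeautifulSoup

    html = '<div class="norm-jobs-wrapper"><div class="comp-name-text">Acme Ltd</div></div>'
    card = BeautifulSoup(html, "html.parser")
    company = card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
    print(company.get_text(strip=True))  # Acme Ltd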
    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")

            # Find job description - IMPROVED based on correct.py
            description = ""

            # Try to find the job content div first (as in correct.py)
            job_content_div = soup.find('div', class_='jobcontent')
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower())
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ['hr', 'h4', 'h5']:
                            break
                        if sibling.name == 'ul':
                            responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li'))
                        elif sibling.name == 'p':
                            responsibilities_elements.append(sibling.get_text(separator=' ', strip=True))

                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""

            # If no description found yet, try the original approach
            if not description:
                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
            job_type = None
            if job_type_elem:
                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True)
                job_type = job_type_text if job_type_text else None

            # Extract company industry
            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
            company_industry = None
            if industry_elem:
                industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True)
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry
            }

        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
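Reviewer note: the responsibilities extraction above walks the siblings of the h4#job_resp heading until it reaches the next heading or horizontal rule. A self-contained sketch of that traversal on a made-up fragment of a detail page:

    from bs4 import BeautifulSoup

    html = """
    <div class="jobcontent">
      <h4 id="job_resp">Job Responsibilities</h4>
      <ul><li>Prepare monthly reports</li><li>Coordinate with vendors</li></ul>
      <hr/>
      <h4>Educational Requirements</h4>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    heading = soup.find("h4", id="job_resp")
    items = []
    for sibling in heading.find_next_siblings():
        if sibling.name in ("hr", "h4", "h5"):
            break                    # stop at the next section
        if sibling.name == "ul":
            items.extend(li.get_text(strip=True) for li in sibling.find_all("li"))
    print("\n".join(items))          # Prepare monthly reports / Coordinate with vendors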
@@ -0,0 +1,32 @@
#constant.py
# Headers for BDJobs requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings
job_selectors = [
    "div.job-item",           # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",      # Catches featured job listings
]

# Date formats used by BDJobs
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]
@@ -0,0 +1,100 @@
#util.py
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional, List, Dict, Any

from jobspy.model import Location, Country


def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Parses location text into a Location object
    :param location_text: Location text from job listing
    :param country: Default country
    :return: Location object
    """
    parts = location_text.split(",")
    if len(parts) >= 2:
        city = parts[0].strip()
        state = parts[1].strip()
        return Location(
            city=city,
            state=state,
            country=Country.from_string(country)
        )
    else:
        return Location(
            city=location_text.strip(),
            country=Country.from_string(country)
        )
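Reviewer note: for two-part strings the first comma-separated piece becomes the city and the second the state; anything else is treated as a city only. A behavior sketch (attribute access assumes the Location kwargs used above map to fields of the same name):

    loc = parse_location("Dhaka, Bangladesh")
    print(loc.city, loc.state)   # Dhaka Bangladesh

    loc = parse_location("Chattogram")
    print(loc.city)              # Chattogram, with no state set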
def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object
    :param date_text: Date text from job listing
    :return: datetime object or None if parsing fails
    """
    from .constant import date_formats

    try:
        # Clean up date text
        if "Deadline:" in date_text:
            date_text = date_text.replace("Deadline:", "").strip()

        # Try different date formats
        for fmt in date_formats:
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue

        return None
    except Exception:
        return None
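Reviewer note: the function strips a leading "Deadline:" label and then tries each entry in date_formats until one parses. A quick behavior sketch:

    print(parse_date("Deadline: 15 Mar 2025"))  # datetime(2025, 3, 15), matched by "%d %b %Y"
    print(parse_date("15-Mar-2025"))            # datetime(2025, 3, 15), matched by "%d-%b-%Y"
    print(parse_date("sometime soon"))          # None, no format matches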
def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Finds job listing elements in the HTML
    :param soup: BeautifulSoup object
    :return: List of job card elements
    """
    from .constant import job_selectors

    # Try different selectors
    for selector in job_selectors:
        if "." in selector:
            tag_name, class_name = selector.split(".", 1)
            elements = soup.find_all(tag_name, class_=class_name)
            if elements and len(elements) > 0:
                return elements

    # If no selectors match, look for job detail links
    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if job_links:
        # Return parent elements of job links
        return [link.parent for link in job_links]

    return []
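Reviewer note: each selector is split on the first dot into a tag name and a class, and the first selector that matches anything wins; otherwise the function falls back to the parents of any jobdetail links. A behavior sketch on a tiny page (assuming the jobspy.bdjobs package from this diff is importable):

    from bs4 import BeautifulSoup
    from jobspy.bdjobs.util import find_job_listings

    html = '<div class="norm-jobs-wrapper"><a href="jobdetails.asp?id=1">Job A</a></div>'
    cards = find_job_listings(BeautifulSoup(html, "html.parser"))
    print(len(cards))  # 1, matched by the "div.norm-jobs-wrapper" selector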
def is_job_remote(title: str, description: str = None, location: Location = None) -> bool:
    """
    Determines if a job is remote based on title, description, and location
    :param title: Job title
    :param description: Job description
    :param location: Job location
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]

    # Combine all text fields
    full_text = title.lower()
    if description:
        full_text += " " + description.lower()
    if location:
        full_text += " " + location.display_location().lower()

    # Check for remote keywords
    return any(keyword in full_text for keyword in remote_keywords)
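Reviewer note: the check is a simple keyword scan over whatever text is supplied. Behavior sketch:

    print(is_job_remote("Remote Python Engineer"))                # True, "remote" appears in the title
    print(is_job_remote("Accountant", "Work from home allowed"))  # True, keyword in the description
    print(is_job_remote("Accountant"))                            # False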
@@ -38,3 +38,8 @@ class BaytException(Exception):
class NaukriException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Naukri")


class BDJobsException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with BDJobs")
@@ -68,6 +68,7 @@ class Country(Enum):
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BANGLADESH = ("bangladesh", "bd")  # Added Bangladesh
    BELGIUM = ("belgium", "be", "fr:be")
    BULGARIA = ("bulgaria", "bg")
    BRAZIL = ("brazil", "br", "com.br")
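Reviewer note: this is the member that Country.from_string("bangladesh") in the new jobspy/bdjobs/util.py is expected to resolve; "bd" is the country-code half of the tuple. A sketch of that lookup (from_string is the resolver the new util module already calls; its exact matching and error behavior are not shown in this diff):

    from jobspy.model import Country

    country = Country.from_string("bangladesh")
    print(country is Country.BANGLADESH)  # True, assuming from_string matches on the name string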
@@ -291,6 +292,7 @@ class Site(Enum):
    GOOGLE = "google"
    BAYT = "bayt"
    NAUKRI = "naukri"
    BDJOBS = "bdjobs"  # Add this line


class SalarySource(Enum):
@@ -314,6 +316,8 @@ class ScraperInput(BaseModel):
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    request_timeout: int = 60

    results_wanted: int = 15
    hours_old: int | None = None
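Reviewer note: the new request_timeout field is read in BDJobs.scrape through getattr(scraper_input, 'request_timeout', 60), so input objects created before this change still fall back to 60 seconds. A minimal sketch of that fallback (_LegacyInput is hypothetical, standing in for an older input object without the field):

    class _LegacyInput:
        pass  # hypothetical pre-change input with no request_timeout attribute

    timeout = getattr(_LegacyInput(), "request_timeout", 60)
    print(timeout)  # 60: a missing attribute falls back to the default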