pull/280/merge
itsShrizon 2025-05-14 13:13:49 +06:00 committed by GitHub
commit 5266d1edd0
6 changed files with 432 additions and 2 deletions

jobspy/__init__.py

@@ -6,6 +6,7 @@ from typing import Tuple
import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.bdjobs import BDJobs
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
@@ -25,6 +26,8 @@ from jobspy.util import (
from jobspy.ziprecruiter import ZipRecruiter


def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
@@ -59,6 +62,7 @@ def scrape_jobs(
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
        Site.NAUKRI: Naukri,
        Site.BDJOBS: BDJobs,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None
@@ -213,3 +217,9 @@ def scrape_jobs(
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()


__all__ = [
    "scrape_jobs",
    "BDJobs",
]
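
With BDJobs wired into the scraper mapping, the new site can be exercised through the public entry point. A minimal smoke-test sketch, using only parameter names visible in the scrape_jobs signature above (the printed columns depend on the existing DataFrame schema, so head() is used rather than naming columns):

# Smoke test for the new site; assumes the package is installed locally.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name="bdjobs",
    search_term="software engineer",
    results_wanted=10,
)
print(len(jobs), "jobs scraped")
print(jobs.head())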

jobspy/bdjobs/__init__.py (new file, 279 lines)

@@ -0,0 +1,279 @@
# __init__.py
from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")


class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted

        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")
            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )

                # DEBUG: Save the received HTML content
                try:
                    with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
                        f.write(response.text)
                    log.info("Saved scraper response to scraper_received_bdjobs.html")
                except Exception as e_write:
                    log.error(f"Error writing debug HTML file: {e_write}")

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)
                        if not continue_search():
                            break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")

                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            job_id = (
                job_url.split("jobid=")[-1].split("&")[0]
                if "jobid=" in job_url
                else f"bdjobs-{hash(job_url)}"
            )

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(
                    ["h2", "h3", "h4", "strong", "div"],
                    class_=lambda c: c and "job-title-text" in c,
                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name
            company_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "comp-name-text" in (c or "").lower(),
            )
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]),
                )
                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"

            # Extract location
            location_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c and "locon-text-d" in (c or "").lower(),
            )
            if not location_elem:
                location_elem = job_card.find(
                    ["span", "div"],
                    class_=lambda c: c
                    and any(term in (c or "").lower() for term in ["location", "area", "locon"]),
                )
            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            date_elem = job_card.find(
                ["span", "div"],
                class_=lambda c: c
                and any(term in (c or "").lower() for term in ["date", "deadline", "published"]),
            )
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")
            description = ""

            # Try to find the job content div first
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for the responsibilities section
                responsibilities_heading = job_content_div.find("h4", id="job_resp") or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Collect all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True) for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(sibling.get_text(separator=" ", strip=True))
                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""

            # If no description found yet, fall back to generic selectors
            if not description:
                description_elem = soup.find(
                    ["div", "section"],
                    class_=lambda c: c
                    and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]),
                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if (
                        hasattr(self.scraper_input, "description_format")
                        and self.scraper_input.description_format == DescriptionFormat.MARKDOWN
                    ):
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(
                ["span", "div"],
                string=lambda s: s and any(term in s.lower() for term in ["job type", "employment type"]),
            )
            job_type = None
            if job_type_elem:
                next_elem = job_type_elem.find_next(["span", "div"])
                job_type_text = next_elem.get_text(strip=True) if next_elem else ""
                job_type = job_type_text if job_type_text else None

            # Extract company industry
            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in s.lower())
            company_industry = None
            if industry_elem:
                next_elem = industry_elem.find_next(["span", "div"])
                industry_text = next_elem.get_text(strip=True) if next_elem else ""
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }
        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
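
For reviewers who want to drive the scraper class directly rather than going through scrape_jobs, a minimal sketch; the ScraperInput field names, in particular site_type, are assumed from jobspy/model.py:

from jobspy.bdjobs import BDJobs
from jobspy.model import ScraperInput, Site

scraper = BDJobs()
scraper_input = ScraperInput(
    site_type=[Site.BDJOBS],  # assumed field name on ScraperInput
    search_term="python",
    results_wanted=5,
)
response = scraper.scrape(scraper_input)
for job in response.jobs:
    print(job.title, "|", job.company_name, "|", job.job_url)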

jobspy/bdjobs/constant.py (new file, 32 lines)

@@ -0,0 +1,32 @@
# constant.py

# Headers for BDJobs requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Search parameters that work best for BDJobs
search_params = {
    "hidJobSearch": "jobsearch",
}

# Selectors for job listings
job_selectors = [
    "div.job-item",           # catches normal and premium job cards, plus other card types
    "div.sout-jobs-wrapper",  # catches job listings on the main search results page
    "div.norm-jobs-wrapper",  # catches normal job listings
    "div.featured-wrap",      # catches featured job listings
]

# Date formats used by BDJobs
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]
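
As a sanity check on date_formats: a typical BDJobs deadline string matches the first entry once parse_date (in util.py below) strips the "Deadline:" prefix. The date value here is purely illustrative:

from datetime import datetime

datetime.strptime("25 Dec 2025", "%d %b %Y")  # -> datetime(2025, 12, 25, 0, 0)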

jobspy/bdjobs/util.py (new file, 100 lines)

@@ -0,0 +1,100 @@
# util.py
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional, List, Dict, Any

from jobspy.model import Location, Country


def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Parses location text into a Location object
    :param location_text: Location text from job listing
    :param country: Default country
    :return: Location object
    """
    parts = location_text.split(",")
    if len(parts) >= 2:
        city = parts[0].strip()
        state = parts[1].strip()
        return Location(
            city=city,
            state=state,
            country=Country.from_string(country),
        )
    else:
        return Location(
            city=location_text.strip(),
            country=Country.from_string(country),
        )


def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object
    :param date_text: Date text from job listing
    :return: datetime object or None if parsing fails
    """
    from .constant import date_formats

    try:
        # Clean up date text
        if "Deadline:" in date_text:
            date_text = date_text.replace("Deadline:", "").strip()

        # Try different date formats
        for fmt in date_formats:
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue
        return None
    except Exception:
        return None


def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Finds job listing elements in the HTML
    :param soup: BeautifulSoup object
    :return: List of job card elements
    """
    from .constant import job_selectors

    # Try different selectors
    for selector in job_selectors:
        if "." in selector:
            tag_name, class_name = selector.split(".", 1)
            elements = soup.find_all(tag_name, class_=class_name)
            if elements:
                return elements

    # If no selectors match, look for job detail links
    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    if job_links:
        # Return parent elements of job links
        return [link.parent for link in job_links]
    return []


def is_job_remote(title: str, description: Optional[str] = None, location: Optional[Location] = None) -> bool:
    """
    Determines if a job is remote based on title, description, and location
    :param title: Job title
    :param description: Job description
    :param location: Job location
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ["remote", "work from home", "wfh", "home based"]
    # Combine all text fields
    full_text = title.lower()
    if description:
        full_text += " " + description.lower()
    if location:
        full_text += " " + location.display_location().lower()
    # Check for remote keywords
    return any(keyword in full_text for keyword in remote_keywords)
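
A short illustration of the helpers above with made-up inputs. Note that a two-part string like "Dhaka, Bangladesh" maps the second part to state, since parse_location splits only on commas:

from jobspy.bdjobs.util import parse_location, is_job_remote

loc = parse_location("Dhaka, Bangladesh")  # city="Dhaka", state="Bangladesh"
print(is_job_remote("Software Engineer (Remote)", location=loc))  # True: "remote" in title
print(is_job_remote("Accountant", location=loc))                  # False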

jobspy/exception.py

@@ -38,3 +38,8 @@ class BaytException(Exception):
class NaukriException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Naukri")


class BDJobsException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with BDJobs")

jobspy/model.py

@@ -68,6 +68,7 @@ class Country(Enum):
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BANGLADESH = ("bangladesh", "bd")
    BELGIUM = ("belgium", "be", "fr:be")
    BULGARIA = ("bulgaria", "bg")
    BRAZIL = ("brazil", "br", "com.br")
@@ -291,6 +292,7 @@ class Site(Enum):
    GOOGLE = "google"
    BAYT = "bayt"
    NAUKRI = "naukri"
    BDJOBS = "bdjobs"


class SalarySource(Enum):
@@ -314,6 +316,8 @@ class ScraperInput(BaseModel):
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    request_timeout: int = 60
    results_wanted: int = 15
    hours_old: int | None = None