diff --git a/jobspy/__init__.py b/jobspy/__init__.py
index 7ec88e5..6834f5f 100644
--- a/jobspy/__init__.py
+++ b/jobspy/__init__.py
@@ -6,6 +6,7 @@ from typing import Tuple
 import pandas as pd
 
 from jobspy.bayt import BaytScraper
+from jobspy.bdjobs import BDJobs
 from jobspy.glassdoor import Glassdoor
 from jobspy.google import Google
 from jobspy.indeed import Indeed
@@ -59,6 +60,7 @@
         Site.GOOGLE: Google,
         Site.BAYT: BaytScraper,
         Site.NAUKRI: Naukri,
+        Site.BDJOBS: BDJobs,
     }
     set_logger_level(verbose)
     job_type = get_enum_from_value(job_type) if job_type else None
@@ -212,4 +214,10 @@
             by=["site", "date_posted"], ascending=[True, False]
         ).reset_index(drop=True)
     else:
-        return pd.DataFrame()
\ No newline at end of file
+        return pd.DataFrame()
+
+
+__all__ = [
+    "scrape_jobs",
+    "BDJobs",
+]
\ No newline at end of file
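diff --git a/examples/bdjobs_quickstart.py b/examples/bdjobs_quickstart.py
new file mode 100644
--- /dev/null
+++ b/examples/bdjobs_quickstart.py
@@ -0,0 +1,13 @@
+# Illustrative usage sketch for the new BDJobs backend; the file path and the
+# parameter values here are examples rather than part of the change itself.
+from jobspy import scrape_jobs
+
+# "bdjobs" is resolved to Site.BDJOBS through the same name-to-enum lookup used
+# for the other boards, so it can be combined with them in site_name.
+jobs = scrape_jobs(
+    site_name="bdjobs",
+    search_term="python developer",
+    results_wanted=10,
+)
+print(f"Found {len(jobs)} jobs")
+print(jobs.head())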
diff --git a/jobspy/bdjobs/__init__.py b/jobspy/bdjobs/__init__.py
new file mode 100644
index 0000000..47ef2de
--- /dev/null
+++ b/jobspy/bdjobs/__init__.py
@@ -0,0 +1,271 @@
+from __future__ import annotations
+
+import random
+import time
+from typing import Optional, Dict, Any
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+
+from jobspy.bdjobs.constant import headers, search_params
+from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
+from jobspy.model import (
+    JobPost,
+    JobResponse,
+    Scraper,
+    ScraperInput,
+    Site,
+    DescriptionFormat,
+)
+from jobspy.util import (
+    create_session,
+    create_logger,
+    remove_attributes,
+    markdown_converter,
+)
+
+log = create_logger("BDJobs")
+
+
+class BDJobs(Scraper):
+    base_url = "https://jobs.bdjobs.com"
+    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
+    delay = 2
+    band_delay = 3
+
+    def __init__(
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+    ):
+        """
+        Initializes the BDJobs scraper with the BDJobs job search url
+        """
+        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
+        self.session = create_session(
+            proxies=self.proxies,
+            ca_cert=ca_cert,
+            is_tls=False,
+            has_retry=True,
+            delay=5,
+            clear_cookies=True,
+        )
+        self.session.headers.update(headers)
+        self.scraper_input = None
+        self.country = "bangladesh"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes BDJobs for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
+        self.scraper_input = scraper_input
+        job_list: list[JobPost] = []
+        seen_ids = set()
+        page = 1
+        request_count = 0
+
+        # Set up search parameters
+        params = search_params.copy()
+        params["txtsearch"] = scraper_input.search_term
+
+        continue_search = lambda: len(job_list) < scraper_input.results_wanted
+
+        while continue_search():
+            request_count += 1
+            log.info(f"search page: {request_count}")
+
+            try:
+                # Add page parameter if needed
+                if page > 1:
+                    params["pg"] = page
+
+                response = self.session.get(
+                    self.search_url,
+                    params=params,
+                    timeout=scraper_input.request_timeout,
+                )
+
+                if response.status_code != 200:
+                    log.error(f"BDJobs response status code {response.status_code}")
+                    break
+
+                soup = BeautifulSoup(response.text, "html.parser")
+                job_cards = find_job_listings(soup)
+
+                if not job_cards:
+                    log.info("No more job listings found")
+                    break
+
+                log.info(f"Found {len(job_cards)} job cards on page {page}")
+
+                for job_card in job_cards:
+                    try:
+                        job_post = self._process_job(job_card)
+                        if job_post and job_post.id not in seen_ids:
+                            seen_ids.add(job_post.id)
+                            job_list.append(job_post)
+
+                            if not continue_search():
+                                break
+                    except Exception as e:
+                        log.error(f"Error processing job card: {str(e)}")
+
+                page += 1
+                # Add delay between requests
+                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+
+            except Exception as e:
+                log.error(f"Error during scraping: {str(e)}")
+                break
+
+        job_list = job_list[:scraper_input.results_wanted]
+        return JobResponse(jobs=job_list)
+
+    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
+        """
+        Processes a job card element into a JobPost object
+        :param job_card: Job card element
+        :return: JobPost object
+        """
+        try:
+            # Extract job ID and URL
+            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
+            if not job_link:
+                return None
+
+            job_url = job_link.get("href")
+            if not job_url.startswith("http"):
+                job_url = urljoin(self.base_url, job_url)
+
+            # Extract job ID from URL
+            job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"
+
+            # Extract title
+            title = job_link.get_text(strip=True)
+            if not title:
+                title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
+                title = title_elem.get_text(strip=True) if title_elem else "N/A"
+
+            # Extract company name
+            company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in c.lower())
+            if company_elem:
+                company_name = company_elem.get_text(strip=True)
+            else:
+                # Try alternative selectors
+                company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in c.lower() for term in ["company", "org", "comp-name"]))
+                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"
+
+            # Extract location
+            location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in c.lower())
+            if not location_elem:
+                location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in c.lower() for term in ["location", "area", "locon"]))
+            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"
+
+            # Create Location object
+            location = parse_location(location_text, self.country)
+
+            # Extract date posted
+            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in c.lower() for term in ["date", "deadline", "published"]))
+            date_posted = None
+            if date_elem:
+                date_text = date_elem.get_text(strip=True)
+                date_posted = parse_date(date_text)
+
+            # Check if job is remote
+            is_remote = is_job_remote(title, location=location)
+
+            # Create job post object
+            job_post = JobPost(
+                id=job_id,
+                title=title,
+                company_name=company_name,
+                location=location,
+                date_posted=date_posted,
+                job_url=job_url,
+                is_remote=is_remote,
+                site=self.site,
+            )
+
+            # Always fetch the description from the detail page
+            job_details = self._get_job_details(job_url)
+            job_post.description = job_details.get("description", "")
+            job_post.job_type = job_details.get("job_type")
+
+            return job_post
+        except Exception as e:
+            log.error(f"Error in _process_job: {str(e)}")
+            return None
+
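+    # NOTE: the hash() fallback ID above is stable only within a single run
+    # (str hashing is salted per process); that is enough for the seen_ids
+    # de-duplication in scrape(), but not for persisting IDs across runs.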
+
+    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
+        """
+        Gets detailed job information from the job page
+        :param job_url: Job page URL
+        :return: Dictionary with job details
+        """
+        try:
+            response = self.session.get(job_url, timeout=self.scraper_input.request_timeout)
+            if response.status_code != 200:
+                return {}
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Find the job description
+            description = ""
+
+            # Try the main job content container first
+            job_content_div = soup.find("div", class_="jobcontent")
+            if job_content_div:
+                # Look for the responsibilities section
+                responsibilities_heading = job_content_div.find("h4", id="job_resp") or job_content_div.find(["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower())
+                if responsibilities_heading:
+                    responsibilities_elements = []
+                    # Collect following elements until the next heading or hr
+                    for sibling in responsibilities_heading.find_next_siblings():
+                        if sibling.name in ["hr", "h4", "h5"]:
+                            break
+                        if sibling.name == "ul":
+                            responsibilities_elements.extend(li.get_text(separator=" ", strip=True) for li in sibling.find_all("li"))
+                        elif sibling.name == "p":
+                            responsibilities_elements.append(sibling.get_text(separator=" ", strip=True))
+
+                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""
+
+            # If nothing was found, fall back to generic description containers
+            if not description:
+                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in c.lower() for term in ["job-description", "details", "requirements"]))
+                if description_elem:
+                    description_elem = remove_attributes(description_elem)
+                    description = description_elem.prettify(formatter="html")
+                    if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+                        description = markdown_converter(description)
+
+            # Extract job type
+            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in s.lower() for term in ["job type", "employment type"]))
+            job_type = None
+            if job_type_elem:
+                job_type_value = job_type_elem.find_next(["span", "div"])
+                if job_type_value:
+                    job_type = job_type_value.get_text(strip=True) or None
+
+            # Extract company industry
+            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in s.lower())
+            company_industry = None
+            if industry_elem:
+                industry_value = industry_elem.find_next(["span", "div"])
+                if industry_value:
+                    company_industry = industry_value.get_text(strip=True) or None
+
+            return {
+                "description": description,
+                "job_type": job_type,
+                "company_industry": company_industry,
+            }
+
+        except Exception as e:
+            log.error(f"Error getting job details: {str(e)}")
+            return {}
\ No newline at end of file
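diff --git a/examples/bdjobs_description_walk.py b/examples/bdjobs_description_walk.py
new file mode 100644
--- /dev/null
+++ b/examples/bdjobs_description_walk.py
@@ -0,0 +1,24 @@
+# Standalone sketch of the sibling walk _get_job_details uses to pull the
+# responsibilities list; the canned HTML below is an assumed page shape, not a
+# captured BDJobs response.
+from bs4 import BeautifulSoup
+
+html = """
+<div class="jobcontent">
+  <h4 id="job_resp">Job Responsibilities</h4>
+  <ul><li>Build scrapers</li><li>Write tests</li></ul>
+  <hr/>
+  <h4>Requirements</h4>
+</div>
+"""
+
+soup = BeautifulSoup(html, "html.parser")
+heading = soup.find("h4", id="job_resp")
+items = []
+# Walk forward through sibling tags, stopping at the next section boundary.
+for sibling in heading.find_next_siblings():
+    if sibling.name in ["hr", "h4", "h5"]:
+        break
+    if sibling.name == "ul":
+        items.extend(li.get_text(strip=True) for li in sibling.find_all("li"))
+print(items)  # -> ['Build scrapers', 'Write tests']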
diff --git a/jobspy/bdjobs/constant.py b/jobspy/bdjobs/constant.py
new file mode 100644
index 0000000..d671a08
--- /dev/null
+++ b/jobspy/bdjobs/constant.py
@@ -0,0 +1,31 @@
+# Headers for BDJobs requests
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Connection": "keep-alive",
+    "Referer": "https://jobs.bdjobs.com/",
+    "Cache-Control": "max-age=0",
+}
+
+# Base query parameters for the BDJobs search endpoint
+search_params = {
+    "hidJobSearch": "jobsearch",
+}
+
+# Selectors tried in order when locating job cards
+job_selectors = [
+    "div.job-item",  # Normal and premium job cards, plus other card types
+    "div.sout-jobs-wrapper",  # Job listings on the main search results page
+    "div.norm-jobs-wrapper",  # Normal job listings
+    "div.featured-wrap",  # Featured job listings
+]
+
+# Date formats used by BDJobs
+date_formats = [
+    "%d %b %Y",
+    "%d-%b-%Y",
+    "%d %B %Y",
+    "%B %d, %Y",
+    "%d/%m/%Y",
+]
\ No newline at end of file
diff --git a/jobspy/bdjobs/util.py b/jobspy/bdjobs/util.py
new file mode 100644
index 0000000..6207786
--- /dev/null
+++ b/jobspy/bdjobs/util.py
@@ -0,0 +1,104 @@
+from bs4 import BeautifulSoup
+from datetime import datetime
+from typing import Optional, List, Any
+
+from jobspy.model import Location, Country
+
+
+def parse_location(location_text: str, country: str = "bangladesh") -> Location:
+    """
+    Parses location text into a Location object
+    :param location_text: Location text from job listing
+    :param country: Default country
+    :return: Location object
+    """
+    parts = location_text.split(",")
+    if len(parts) >= 2:
+        city = parts[0].strip()
+        state = parts[1].strip()
+        return Location(
+            city=city,
+            state=state,
+            country=Country.from_string(country)
+        )
+    else:
+        return Location(
+            city=location_text.strip(),
+            country=Country.from_string(country)
+        )
+
+
+def parse_date(date_text: str) -> Optional[datetime]:
+    """
+    Parses date text into a datetime object
+    :param date_text: Date text from job listing
+    :return: datetime object or None if parsing fails
+    """
+    from .constant import date_formats
+
+    try:
+        # Clean up date text
+        if "Deadline:" in date_text:
+            date_text = date_text.replace("Deadline:", "").strip()
+
+        # Try different date formats
+        for fmt in date_formats:
+            try:
+                return datetime.strptime(date_text, fmt)
+            except ValueError:
+                continue
+
+        return None
+    except Exception:
+        return None
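+
+
+# Example behavior of the parsers above (illustrative values):
+#   parse_location("Dhaka, Bangladesh") -> Location(city="Dhaka", state="Bangladesh", ...)
+#   parse_date("Deadline: 15 Mar 2025") -> datetime(2025, 3, 15)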
+
+
+def find_job_listings(soup: BeautifulSoup) -> List[Any]:
+    """
+    Finds job listing elements in the HTML
+    :param soup: BeautifulSoup object
+    :return: List of job card elements
+    """
+    from .constant import job_selectors
+
+    # Try different selectors
+    for selector in job_selectors:
+        if "." in selector:
+            tag_name, class_name = selector.split(".", 1)
+            elements = soup.find_all(tag_name, class_=class_name)
+            if elements:
+                return elements
+
+    # If no selectors match, look for job detail links
+    job_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
+    if job_links:
+        # Return parent elements of job links
+        return [link.parent for link in job_links]
+
+    return []
+
+
+def is_job_remote(title: str, description: Optional[str] = None, location: Optional[Location] = None) -> bool:
+    """
+    Determines if a job is remote based on title, description, and location
+    :param title: Job title
+    :param description: Job description
+    :param location: Job location
+    :return: True if job is remote, False otherwise
+    """
+    remote_keywords = ["remote", "work from home", "wfh", "home based"]
+
+    # Combine all text fields
+    full_text = title.lower()
+    if description:
+        full_text += " " + description.lower()
+    if location:
+        full_text += " " + location.display_location().lower()
+
+    # Check for remote keywords
+    return any(keyword in full_text for keyword in remote_keywords)
\ No newline at end of file
diff --git a/jobspy/exception.py b/jobspy/exception.py
index ebd96b4..4fc8578 100644
--- a/jobspy/exception.py
+++ b/jobspy/exception.py
@@ -37,4 +37,9 @@ class BaytException(Exception):
 
 class NaukriException(Exception):
     def __init__(self,message=None):
-        super().__init__(message or "An error occurred with Naukri")
\ No newline at end of file
+        super().__init__(message or "An error occurred with Naukri")
+
+
+class BDJobsException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with BDJobs")
\ No newline at end of file
diff --git a/jobspy/model.py b/jobspy/model.py
index f9155b1..3ba7d03 100644
--- a/jobspy/model.py
+++ b/jobspy/model.py
@@ -68,6 +68,7 @@ class Country(Enum):
     AUSTRALIA = ("australia", "au", "com.au")
     AUSTRIA = ("austria", "at", "at")
     BAHRAIN = ("bahrain", "bh")
+    BANGLADESH = ("bangladesh", "bd")
     BELGIUM = ("belgium", "be", "fr:be")
     BULGARIA = ("bulgaria", "bg")
     BRAZIL = ("brazil", "br", "com.br")
@@ -291,6 +292,7 @@ class Site(Enum):
     GOOGLE = "google"
     BAYT = "bayt"
     NAUKRI = "naukri"
+    BDJOBS = "bdjobs"
 
 
 class SalarySource(Enum):
@@ -314,6 +316,8 @@ class ScraperInput(BaseModel):
     linkedin_company_ids: list[int] | None = None
     description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
 
+    request_timeout: int = 60
+
     results_wanted: int = 15
     hours_old: int | None = None
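diff --git a/tests/test_bdjobs_util.py b/tests/test_bdjobs_util.py
new file mode 100644
--- /dev/null
+++ b/tests/test_bdjobs_util.py
@@ -0,0 +1,20 @@
+# Illustrative test sketch for the pure helpers in jobspy/bdjobs/util.py; the
+# file path and the sample inputs are assumptions, not captured BDJobs data.
+from jobspy.bdjobs.util import is_job_remote, parse_date, parse_location
+
+
+def test_parse_location_splits_city_and_state():
+    loc = parse_location("Dhaka, Bangladesh")
+    assert loc.city == "Dhaka"
+    assert loc.state == "Bangladesh"
+
+
+def test_parse_date_strips_deadline_prefix():
+    parsed = parse_date("Deadline: 15 Mar 2025")
+    assert parsed is not None
+    assert (parsed.year, parsed.month, parsed.day) == (2025, 3, 15)
+
+
+def test_is_job_remote_matches_title_keyword():
+    assert is_job_remote("Remote Python Developer")
+    assert not is_job_remote("Office Manager")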