# Mirror of https://github.com/Bunsly/JobSpy
# __init__.py

from __future__ import annotations

import random
import time
from datetime import datetime
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
from jobspy.model import (
    JobPost,
    Location,
    JobResponse,
    Country,
    Scraper,
    ScraperInput,
    Site,
    DescriptionFormat,
)
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    create_logger,
    remove_attributes,
    markdown_converter,
)

log = create_logger("BDJobs")


class BDJobs(Scraper):
    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    delay = 2
    band_delay = 3
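    # delay and band_delay bound the random pause (in seconds) taken between
    # successive search-page requests; see the time.sleep() call in scrape().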

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url
        """
        super().__init__(Site.BDJOBS, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1
        request_count = 0

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        continue_search = lambda: len(job_list) < scraper_input.results_wanted
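        # continue_search() is re-checked after every processed card as well as by
        # the while-loop below, so paging stops as soon as results_wanted is reached.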

        while continue_search():
            request_count += 1
            log.info(f"search page: {request_count}")

            try:
                # Add page parameter if needed
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )
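                # getattr() keeps the request working when ScraperInput defines no
                # request_timeout attribute; 60 seconds is used as the fallback.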

                # DEBUG: Save the received HTML content
                try:
                    with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
                        f.write(response.text)
                    log.info("Saved scraper response to scraper_received_bdjobs.html")
                except Exception as e_write:
                    log.error(f"Error writing debug HTML file: {e_write}")

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)

                        if not continue_search():
                            break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")

                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[:scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object
        :param job_card: Job card element
        :return: JobPost object
        """
        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"
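            # When the URL carries no "jobid=" query parameter, the synthetic
            # "bdjobs-<hash>" id still lets seen_ids deduplicate repeated cards.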

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name
            company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
                company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]))
                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"

            # Extract location
            location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower())
            if not location_elem:
                location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"]))
            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
                date_posted = parse_date(date_text)

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,  # Use company_name instead of company
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type")
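            # Note (not in the original): extract_emails_from_text is imported above
            # but never used; assuming JobPost exposes an `emails` field as other
            # jobspy scrapers populate, one could additionally set:
            #   job_post.emails = extract_emails_from_text(job_post.description)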

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page
        :param job_url: Job page URL
        :return: Dictionary with job details
        """
        try:
            response = self.session.get(job_url, timeout=60)
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")

            # Find job description
            description = ""

            # Try to find the job content div first
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for responsibilities section
                responsibilities_heading = job_content_div.find("h4", id="job_resp") or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Collect all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True) for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(sibling.get_text(separator=" ", strip=True))

                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""

            # If no description found yet, fall back to class-based selectors
            if not description:
                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if hasattr(self.scraper_input, "description_format") and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                        description = markdown_converter(description)

            # Extract job type
            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
            job_type = None
            if job_type_elem:
                job_type_value = job_type_elem.find_next(["span", "div"])
                if job_type_value:
                    job_type = job_type_value.get_text(strip=True) or None

            # Extract company industry
            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
            company_industry = None
            if industry_elem:
                industry_value = industry_elem.find_next(["span", "div"])
                if industry_value:
                    company_industry = industry_value.get_text(strip=True) or None

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }

        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
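

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original module).
# scrape() only reads `search_term`, `results_wanted`, and optionally
# `description_format` / `request_timeout` from ScraperInput; the `site_type`
# argument below is an assumption about jobspy.model and may need adjusting.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_input = ScraperInput(
        site_type=[Site.BDJOBS],  # assumed field name, see note above
        search_term="python developer",
        results_wanted=5,
    )
    demo_jobs = BDJobs().scrape(demo_input)
    for post in demo_jobs.jobs:
        print(post.title, "-", post.job_url)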