chore: readme

This commit is contained in:
Cullen Watson
2025-07-28 17:10:15 +02:00
parent ae2b1ea42c
commit 4b16ac7967
3 changed files with 155 additions and 78 deletions

View File

@@ -4,7 +4,7 @@
## Features ## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & other job boards concurrently
- Aggregates the job postings in a dataframe - Aggregates the job postings in a dataframe
- Proxies support to bypass blocking - Proxies support to bypass blocking
@@ -25,7 +25,7 @@ import csv
from jobspy import scrape_jobs from jobspy import scrape_jobs
jobs = scrape_jobs( jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
search_term="software engineer", search_term="software engineer",
google_search_term="software engineer jobs near San Francisco, CA since yesterday", google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA", location="San Francisco, CA",
@@ -59,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext ```plaintext
Optional Optional
├── site_name (list|str): ├── site_name (list|str):
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt | linkedin, zip_recruiter, indeed, glassdoor, google, bayt, bdjobs
| (default is all) | (default is all)
├── search_term (str) ├── search_term (str)
@@ -86,6 +86,9 @@ Optional
├── easy_apply (bool): ├── easy_apply (bool):
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works) | filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
├── user_agent (str):
| override the default user agent which may be outdated
├── description_format (str): ├── description_format (str):
| markdown, html (Format type of the job descriptions. Default is markdown.) | markdown, html (Format type of the job descriptions. Default is markdown.)

View File

@@ -1,4 +1,4 @@
#__init__.py # __init__.py
from __future__ import annotations from __future__ import annotations
import random import random
@@ -12,7 +12,12 @@ from bs4.element import Tag
from jobspy.exception import BDJobsException from jobspy.exception import BDJobsException
from jobspy.bdjobs.constant import headers, search_params from jobspy.bdjobs.constant import headers, search_params
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote from jobspy.bdjobs.util import (
parse_location,
parse_date,
find_job_listings,
is_job_remote,
)
from jobspy.model import ( from jobspy.model import (
JobPost, JobPost,
Location, Location,
@@ -39,7 +44,7 @@ class BDJobs(Scraper):
search_url = "https://jobs.bdjobs.com/jobsearch.asp" search_url = "https://jobs.bdjobs.com/jobsearch.asp"
delay = 2 delay = 2
band_delay = 3 band_delay = 3
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None
): ):
@@ -58,7 +63,7 @@ class BDJobs(Scraper):
self.session.headers.update(headers) self.session.headers.update(headers)
self.scraper_input = None self.scraper_input = None
self.country = "bangladesh" self.country = "bangladesh"
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
Scrapes BDJobs for jobs with scraper_input criteria Scrapes BDJobs for jobs with scraper_input criteria
@@ -70,72 +75,64 @@ class BDJobs(Scraper):
seen_ids = set() seen_ids = set()
page = 1 page = 1
request_count = 0 request_count = 0
# Set up search parameters # Set up search parameters
params = search_params.copy() params = search_params.copy()
params["txtsearch"] = scraper_input.search_term params["txtsearch"] = scraper_input.search_term
continue_search = lambda: len(job_list) < scraper_input.results_wanted continue_search = lambda: len(job_list) < scraper_input.results_wanted
while continue_search(): while continue_search():
request_count += 1 request_count += 1
log.info(f"search page: {request_count}") log.info(f"search page: {request_count}")
try: try:
# Add page parameter if needed # Add page parameter if needed
if page > 1: if page > 1:
params["pg"] = page params["pg"] = page
response = self.session.get( response = self.session.get(
self.search_url, self.search_url,
params=params, params=params,
timeout=getattr(scraper_input, 'request_timeout', 60) timeout=getattr(scraper_input, "request_timeout", 60),
) )
# DEBUG: Save the received HTML content
try:
with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
f.write(response.text)
log.info(f"Saved scraper response to scraper_received_bdjobs.html")
except Exception as e_write:
log.error(f"Error writing debug HTML file: {e_write}")
if response.status_code != 200: if response.status_code != 200:
log.error(f"BDJobs response status code {response.status_code}") log.error(f"BDJobs response status code {response.status_code}")
break break
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
job_cards = find_job_listings(soup) job_cards = find_job_listings(soup)
if not job_cards or len(job_cards) == 0: if not job_cards or len(job_cards) == 0:
log.info("No more job listings found") log.info("No more job listings found")
break break
log.info(f"Found {len(job_cards)} job cards on page {page}") log.info(f"Found {len(job_cards)} job cards on page {page}")
for job_card in job_cards: for job_card in job_cards:
try: try:
job_post = self._process_job(job_card) job_post = self._process_job(job_card)
if job_post and job_post.id not in seen_ids: if job_post and job_post.id not in seen_ids:
seen_ids.add(job_post.id) seen_ids.add(job_post.id)
job_list.append(job_post) job_list.append(job_post)
if not continue_search(): if not continue_search():
break break
except Exception as e: except Exception as e:
log.error(f"Error processing job card: {str(e)}") log.error(f"Error processing job card: {str(e)}")
page += 1 page += 1
# Add delay between requests # Add delay between requests
time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
except Exception as e: except Exception as e:
log.error(f"Error during scraping: {str(e)}") log.error(f"Error during scraping: {str(e)}")
break break
job_list = job_list[:scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def _process_job(self, job_card: Tag) -> Optional[JobPost]: def _process_job(self, job_card: Tag) -> Optional[JobPost]:
""" """
Processes a job card element into a JobPost object Processes a job card element into a JobPost object
@@ -147,48 +144,88 @@ class BDJobs(Scraper):
job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower()) job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
if not job_link: if not job_link:
return None return None
job_url = job_link.get("href") job_url = job_link.get("href")
if not job_url.startswith("http"): if not job_url.startswith("http"):
job_url = urljoin(self.base_url, job_url) job_url = urljoin(self.base_url, job_url)
# Extract job ID from URL # Extract job ID from URL
job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}" job_id = (
job_url.split("jobid=")[-1].split("&")[0]
if "jobid=" in job_url
else f"bdjobs-{hash(job_url)}"
)
# Extract title # Extract title
title = job_link.get_text(strip=True) title = job_link.get_text(strip=True)
if not title: if not title:
title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c) title_elem = job_card.find(
["h2", "h3", "h4", "strong", "div"],
class_=lambda c: c and "job-title-text" in c,
)
title = title_elem.get_text(strip=True) if title_elem else "N/A" title = title_elem.get_text(strip=True) if title_elem else "N/A"
# Extract company name - IMPROVED # Extract company name - IMPROVED
company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower()) company_elem = job_card.find(
["span", "div"],
class_=lambda c: c and "comp-name-text" in (c or "").lower(),
)
if company_elem: if company_elem:
company_name = company_elem.get_text(strip=True) company_name = company_elem.get_text(strip=True)
else: else:
# Try alternative selectors # Try alternative selectors
company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"])) company_elem = job_card.find(
company_name = company_elem.get_text(strip=True) if company_elem else "N/A" ["span", "div"],
class_=lambda c: c
and any(
term in (c or "").lower()
for term in ["company", "org", "comp-name"]
),
)
company_name = (
company_elem.get_text(strip=True) if company_elem else "N/A"
)
# Extract location # Extract location
location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower()) location_elem = job_card.find(
["span", "div"],
class_=lambda c: c and "locon-text-d" in (c or "").lower(),
)
if not location_elem: if not location_elem:
location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"])) location_elem = job_card.find(
location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh" ["span", "div"],
class_=lambda c: c
and any(
term in (c or "").lower()
for term in ["location", "area", "locon"]
),
)
location_text = (
location_elem.get_text(strip=True)
if location_elem
else "Dhaka, Bangladesh"
)
# Create Location object # Create Location object
location = parse_location(location_text, self.country) location = parse_location(location_text, self.country)
# Extract date posted # Extract date posted
date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"])) date_elem = job_card.find(
["span", "div"],
class_=lambda c: c
and any(
term in (c or "").lower()
for term in ["date", "deadline", "published"]
),
)
date_posted = None date_posted = None
if date_elem: if date_elem:
date_text = date_elem.get_text(strip=True) date_text = date_elem.get_text(strip=True)
date_posted = parse_date(date_text) date_posted = parse_date(date_text)
# Check if job is remote # Check if job is remote
is_remote = is_job_remote(title, location=location) is_remote = is_job_remote(title, location=location)
# Create job post object # Create job post object
job_post = JobPost( job_post = JobPost(
id=job_id, id=job_id,
@@ -200,17 +237,17 @@ class BDJobs(Scraper):
is_remote=is_remote, is_remote=is_remote,
site=self.site, site=self.site,
) )
# Always fetch description for BDJobs # Always fetch description for BDJobs
job_details = self._get_job_details(job_url) job_details = self._get_job_details(job_url)
job_post.description = job_details.get("description", "") job_post.description = job_details.get("description", "")
job_post.job_type = job_details.get("job_type", "") job_post.job_type = job_details.get("job_type", "")
return job_post return job_post
except Exception as e: except Exception as e:
log.error(f"Error in _process_job: {str(e)}") log.error(f"Error in _process_job: {str(e)}")
return None return None
def _get_job_details(self, job_url: str) -> Dict[str, Any]: def _get_job_details(self, job_url: str) -> Dict[str, Any]:
""" """
Gets detailed job information from the job page Gets detailed job information from the job page
@@ -221,59 +258,96 @@ class BDJobs(Scraper):
response = self.session.get(job_url, timeout=60) response = self.session.get(job_url, timeout=60)
if response.status_code != 200: if response.status_code != 200:
return {} return {}
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
# Find job description - IMPROVED based on correct.py # Find job description - IMPROVED based on correct.py
description = "" description = ""
# Try to find the job content div first (as in correct.py) # Try to find the job content div first (as in correct.py)
job_content_div = soup.find('div', class_='jobcontent') job_content_div = soup.find("div", class_="jobcontent")
if job_content_div: if job_content_div:
# Look for responsibilities section # Look for responsibilities section
responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower()) responsibilities_heading = job_content_div.find(
"h4", id="job_resp"
) or job_content_div.find(
["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
)
if responsibilities_heading: if responsibilities_heading:
responsibilities_elements = [] responsibilities_elements = []
# Find all following elements until the next heading or hr # Find all following elements until the next heading or hr
for sibling in responsibilities_heading.find_next_siblings(): for sibling in responsibilities_heading.find_next_siblings():
if sibling.name in ['hr', 'h4', 'h5']: if sibling.name in ["hr", "h4", "h5"]:
break break
if sibling.name == 'ul': if sibling.name == "ul":
responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li')) responsibilities_elements.extend(
elif sibling.name == 'p': li.get_text(separator=" ", strip=True)
responsibilities_elements.append(sibling.get_text(separator=' ', strip=True)) for li in sibling.find_all("li")
)
description = "\n".join(responsibilities_elements) if responsibilities_elements else "" elif sibling.name == "p":
responsibilities_elements.append(
sibling.get_text(separator=" ", strip=True)
)
description = (
"\n".join(responsibilities_elements)
if responsibilities_elements
else ""
)
# If no description found yet, try the original approach # If no description found yet, try the original approach
if not description: if not description:
description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"])) description_elem = soup.find(
["div", "section"],
class_=lambda c: c
and any(
term in (c or "").lower()
for term in ["job-description", "details", "requirements"]
),
)
if description_elem: if description_elem:
description_elem = remove_attributes(description_elem) description_elem = remove_attributes(description_elem)
description = description_elem.prettify(formatter="html") description = description_elem.prettify(formatter="html")
if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if (
hasattr(self.scraper_input, "description_format")
and self.scraper_input.description_format
== DescriptionFormat.MARKDOWN
):
description = markdown_converter(description) description = markdown_converter(description)
# Extract job type # Extract job type
job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"])) job_type_elem = soup.find(
["span", "div"],
string=lambda s: s
and any(
term in (s or "").lower()
for term in ["job type", "employment type"]
),
)
job_type = None job_type = None
if job_type_elem: if job_type_elem:
job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True) job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
strip=True
)
job_type = job_type_text if job_type_text else None job_type = job_type_text if job_type_text else None
# Extract company industry # Extract company industry
industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower()) industry_elem = soup.find(
["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
)
company_industry = None company_industry = None
if industry_elem: if industry_elem:
industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True) industry_text = industry_elem.find_next(["span", "div"]).get_text(
strip=True
)
company_industry = industry_text if industry_text else None company_industry = industry_text if industry_text else None
return { return {
"description": description, "description": description,
"job_type": job_type, "job_type": job_type,
"company_industry": company_industry "company_industry": company_industry,
} }
except Exception as e: except Exception as e:
log.error(f"Error getting job details: {str(e)}") log.error(f"Error getting job details: {str(e)}")
return {} return {}

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.80" version = "1.1.82"
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt" description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"] authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/cullenwatson/JobSpy"