chore:readme

2026-03-05 03:54:31 -08:00 · 2025-07-28 17:15:09 +02:00
3 changed files with 156 additions and 78 deletions
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@

 ## Features

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter** & other job boards concurrently
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking

@@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs

 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
    search_term="software engineer",
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    location="San Francisco, CA",
@@ -59,7 +59,7 @@ zip_recruiter Software Developer                 TEKsystems        Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str): 
-|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt
+|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt, naukri, bdjobs
 |    (default is all)
 │
 ├── search_term (str)
@@ -86,6 +86,10 @@ Optional
 │
 ├── easy_apply (bool): 
 |    filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
+|
+├── user_agent (str): 
+|    override the default user agent, which may be outdated
+|
 │
 ├── description_format (str): 
 |    markdown, html (Format type of the job descriptions. Default is markdown.)
--- a/jobspy/bdjobs/init.py
+++ b/jobspy/bdjobs/init.py
@@ -1,4 +1,4 @@
-#__init__.py
+# __init__.py
 from __future__ import annotations

 import random
@@ -12,7 +12,12 @@ from bs4.element import Tag

 from jobspy.exception import BDJobsException
 from jobspy.bdjobs.constant import headers, search_params
-from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
+from jobspy.bdjobs.util import (
+    parse_location,
+    parse_date,
+    find_job_listings,
+    is_job_remote,
+)
 from jobspy.model import (
    JobPost,
    Location,
@@ -89,17 +94,9 @@ class BDJobs(Scraper):
                response = self.session.get(
                    self.search_url,
                    params=params,
-                    timeout=getattr(scraper_input, 'request_timeout', 60)
+                    timeout=getattr(scraper_input, "request_timeout", 60),
                )

-                # DEBUG: Save the received HTML content
-                try:
-                    with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
-                        f.write(response.text)
-                    log.info(f"Saved scraper response to scraper_received_bdjobs.html")
-                except Exception as e_write:
-                    log.error(f"Error writing debug HTML file: {e_write}")
-                
                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break
@@ -133,7 +130,7 @@ class BDJobs(Scraper):
                log.error(f"Error during scraping: {str(e)}")
                break

-        job_list = job_list[:scraper_input.results_wanted]
+        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
@@ -153,34 +150,74 @@ class BDJobs(Scraper):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
-            job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"
+            job_id = (
+                job_url.split("jobid=")[-1].split("&")[0]
+                if "jobid=" in job_url
+                else f"bdjobs-{hash(job_url)}"
+            )

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
-                title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
+                title_elem = job_card.find(
+                    ["h2", "h3", "h4", "strong", "div"],
+                    class_=lambda c: c and "job-title-text" in c,
+                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name - IMPROVED
-            company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
+            company_elem = job_card.find(
+                ["span", "div"],
+                class_=lambda c: c and "comp-name-text" in (c or "").lower(),
+            )
            if company_elem:
                company_name = company_elem.get_text(strip=True)
            else:
                # Try alternative selectors
-                company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]))
-                company_name = company_elem.get_text(strip=True) if company_elem else "N/A"
+                company_elem = job_card.find(
+                    ["span", "div"],
+                    class_=lambda c: c
+                    and any(
+                        term in (c or "").lower()
+                        for term in ["company", "org", "comp-name"]
+                    ),
+                )
+                company_name = (
+                    company_elem.get_text(strip=True) if company_elem else "N/A"
+                )

            # Extract location
-            location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower())
+            location_elem = job_card.find(
+                ["span", "div"],
+                class_=lambda c: c and "locon-text-d" in (c or "").lower(),
+            )
            if not location_elem:
-                location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"]))
-            location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"
+                location_elem = job_card.find(
+                    ["span", "div"],
+                    class_=lambda c: c
+                    and any(
+                        term in (c or "").lower()
+                        for term in ["location", "area", "locon"]
+                    ),
+                )
+            location_text = (
+                location_elem.get_text(strip=True)
+                if location_elem
+                else "Dhaka, Bangladesh"
+            )

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
-            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
+            date_elem = job_card.find(
+                ["span", "div"],
+                class_=lambda c: c
+                and any(
+                    term in (c or "").lower()
+                    for term in ["date", "deadline", "published"]
+                ),
+            )
            date_posted = None
            if date_elem:
                date_text = date_elem.get_text(strip=True)
@@ -228,50 +265,87 @@ class BDJobs(Scraper):
            description = ""

            # Try to find the job content div first (as in correct.py)
-            job_content_div = soup.find('div', class_='jobcontent')
+            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                # Look for responsibilities section
-                responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower())
+                responsibilities_heading = job_content_div.find(
+                    "h4", id="job_resp"
+                ) or job_content_div.find(
+                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
+                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Find all following elements until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
-                        if sibling.name in ['hr', 'h4', 'h5']:
+                        if sibling.name in ["hr", "h4", "h5"]:
                            break
-                        if sibling.name == 'ul':
-                            responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li'))
-                        elif sibling.name == 'p':
-                            responsibilities_elements.append(sibling.get_text(separator=' ', strip=True))
+                        if sibling.name == "ul":
+                            responsibilities_elements.extend(
+                                li.get_text(separator=" ", strip=True)
+                                for li in sibling.find_all("li")
+                            )
+                        elif sibling.name == "p":
+                            responsibilities_elements.append(
+                                sibling.get_text(separator=" ", strip=True)
+                            )

-                description = "\n".join(responsibilities_elements) if responsibilities_elements else ""
+                description = (
+                    "\n".join(responsibilities_elements)
+                    if responsibilities_elements
+                    else ""
+                )

            # If no description found yet, try the original approach
            if not description:
-                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
+                description_elem = soup.find(
+                    ["div", "section"],
+                    class_=lambda c: c
+                    and any(
+                        term in (c or "").lower()
+                        for term in ["job-description", "details", "requirements"]
+                    ),
+                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
-                    if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+                    if (
+                        hasattr(self.scraper_input, "description_format")
+                        and self.scraper_input.description_format
+                        == DescriptionFormat.MARKDOWN
+                    ):
                        description = markdown_converter(description)

            # Extract job type
-            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
+            job_type_elem = soup.find(
+                ["span", "div"],
+                string=lambda s: s
+                and any(
+                    term in (s or "").lower()
+                    for term in ["job type", "employment type"]
+                ),
+            )
            job_type = None
            if job_type_elem:
-                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True)
+                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
+                    strip=True
+                )
                job_type = job_type_text if job_type_text else None

            # Extract company industry
-            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
+            industry_elem = soup.find(
+                ["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
+            )
            company_industry = None
            if industry_elem:
-                industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True)
+                industry_text = industry_elem.find_next(["span", "div"]).get_text(
+                    strip=True
+                )
                company_industry = industry_text if industry_text else None

            return {
                "description": description,
                "job_type": job_type,
-                "company_industry": company_industry
+                "company_industry": company_industry,
            }

        except Exception as e:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.80"
+version = "1.1.82"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
 authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
 homepage = "https://github.com/cullenwatson/JobSpy"