diff --git a/README.md b/README.md
index 1b3fe2d..31e7564 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 ## Features
 
-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter** & other job boards concurrently
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking
 
@@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs
 
 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "google"],  # "glassdoor", "bayt", "naukri", "bdjobs"
     search_term="software engineer",
     google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
@@ -59,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt
+|    linkedin, zip_recruiter, indeed, glassdoor, google, bayt, bdjobs
 |    (default is all)
 │
 ├── search_term (str)
@@ -86,6 +86,9 @@ Optional
 │
 ├── easy_apply (bool):
 |    filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
+|
+├── user_agent (str):
+|    overrides the default user agent, which may be outdated
 │
 ├── description_format (str):
 |    markdown, html (Format type of the job descriptions. Default is markdown.)
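The new `bdjobs` site and the `user_agent` option documented above compose like this; a minimal sketch assuming the documented `scrape_jobs` interface, where the search terms and the user agent string are illustrative placeholders, not values shipped by this patch:

```python
from jobspy import scrape_jobs

# Minimal sketch: query only the newly added BDJobs board.
# user_agent is the new optional override documented above; the
# value below is a placeholder, not something this patch ships.
jobs = scrape_jobs(
    site_name="bdjobs",
    search_term="software engineer",
    results_wanted=20,
    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
)
print(jobs.head())  # results are aggregated in a pandas DataFrame
```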
file: {e_write}") - if response.status_code != 200: log.error(f"BDJobs response status code {response.status_code}") break - + soup = BeautifulSoup(response.text, "html.parser") job_cards = find_job_listings(soup) - + if not job_cards or len(job_cards) == 0: log.info("No more job listings found") break - + log.info(f"Found {len(job_cards)} job cards on page {page}") - + for job_card in job_cards: try: job_post = self._process_job(job_card) if job_post and job_post.id not in seen_ids: seen_ids.add(job_post.id) job_list.append(job_post) - + if not continue_search(): break except Exception as e: log.error(f"Error processing job card: {str(e)}") - + page += 1 # Add delay between requests time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) - + except Exception as e: log.error(f"Error during scraping: {str(e)}") break - - job_list = job_list[:scraper_input.results_wanted] + + job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) - + def _process_job(self, job_card: Tag) -> Optional[JobPost]: """ Processes a job card element into a JobPost object @@ -147,48 +144,88 @@ class BDJobs(Scraper): job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower()) if not job_link: return None - + job_url = job_link.get("href") if not job_url.startswith("http"): job_url = urljoin(self.base_url, job_url) - + # Extract job ID from URL - job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}" - + job_id = ( + job_url.split("jobid=")[-1].split("&")[0] + if "jobid=" in job_url + else f"bdjobs-{hash(job_url)}" + ) + # Extract title title = job_link.get_text(strip=True) if not title: - title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c) + title_elem = job_card.find( + ["h2", "h3", "h4", "strong", "div"], + class_=lambda c: c and "job-title-text" in c, + ) title = title_elem.get_text(strip=True) if title_elem else "N/A" - + # Extract company name - IMPROVED - company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower()) + company_elem = job_card.find( + ["span", "div"], + class_=lambda c: c and "comp-name-text" in (c or "").lower(), + ) if company_elem: company_name = company_elem.get_text(strip=True) else: # Try alternative selectors - company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"])) - company_name = company_elem.get_text(strip=True) if company_elem else "N/A" - + company_elem = job_card.find( + ["span", "div"], + class_=lambda c: c + and any( + term in (c or "").lower() + for term in ["company", "org", "comp-name"] + ), + ) + company_name = ( + company_elem.get_text(strip=True) if company_elem else "N/A" + ) + # Extract location - location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower()) + location_elem = job_card.find( + ["span", "div"], + class_=lambda c: c and "locon-text-d" in (c or "").lower(), + ) if not location_elem: - location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"])) - location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh" - + location_elem = job_card.find( + ["span", "div"], + class_=lambda c: c + and any( + term in (c or "").lower() + for term in ["location", "area", "locon"] + ), + ) + location_text = ( + 
+                location_elem.get_text(strip=True)
+                if location_elem
+                else "Dhaka, Bangladesh"
+            )
+
             # Create Location object
             location = parse_location(location_text, self.country)
-            
+
             # Extract date posted
-            date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
+            date_elem = job_card.find(
+                ["span", "div"],
+                class_=lambda c: c
+                and any(
+                    term in (c or "").lower()
+                    for term in ["date", "deadline", "published"]
+                ),
+            )
             date_posted = None
             if date_elem:
                 date_text = date_elem.get_text(strip=True)
                 date_posted = parse_date(date_text)
-            
+
             # Check if job is remote
             is_remote = is_job_remote(title, location=location)
-            
+
             # Create job post object
             job_post = JobPost(
                 id=job_id,
@@ -200,17 +237,17 @@
                 is_remote=is_remote,
                 site=self.site,
             )
-            
+
             # Always fetch description for BDJobs
             job_details = self._get_job_details(job_url)
             job_post.description = job_details.get("description", "")
             job_post.job_type = job_details.get("job_type", "")
-            
+
             return job_post
         except Exception as e:
             log.error(f"Error in _process_job: {str(e)}")
             return None
-    
+
     def _get_job_details(self, job_url: str) -> Dict[str, Any]:
         """
         Gets detailed job information from the job page
@@ -221,59 +258,96 @@
             response = self.session.get(job_url, timeout=60)
             if response.status_code != 200:
                 return {}
-            
+
             soup = BeautifulSoup(response.text, "html.parser")
-            
+
             # Find job description - IMPROVED based on correct.py
             description = ""
-            
+
             # Try to find the job content div first (as in correct.py)
-            job_content_div = soup.find('div', class_='jobcontent')
+            job_content_div = soup.find("div", class_="jobcontent")
             if job_content_div:
                 # Look for responsibilities section
-                responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower())
+                responsibilities_heading = job_content_div.find(
+                    "h4", id="job_resp"
+                ) or job_content_div.find(
+                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
+                )
                 if responsibilities_heading:
                     responsibilities_elements = []
                     # Find all following elements until the next heading or hr
                     for sibling in responsibilities_heading.find_next_siblings():
-                        if sibling.name in ['hr', 'h4', 'h5']:
+                        if sibling.name in ["hr", "h4", "h5"]:
                             break
-                        if sibling.name == 'ul':
-                            responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li'))
-                        elif sibling.name == 'p':
-                            responsibilities_elements.append(sibling.get_text(separator=' ', strip=True))
-                    
-                    description = "\n".join(responsibilities_elements) if responsibilities_elements else ""
-            
+                        if sibling.name == "ul":
+                            responsibilities_elements.extend(
+                                li.get_text(separator=" ", strip=True)
+                                for li in sibling.find_all("li")
+                            )
+                        elif sibling.name == "p":
+                            responsibilities_elements.append(
+                                sibling.get_text(separator=" ", strip=True)
+                            )
+
+                    description = (
+                        "\n".join(responsibilities_elements)
+                        if responsibilities_elements
+                        else ""
+                    )
+
             # If no description found yet, try the original approach
             if not description:
-                description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
+                description_elem = soup.find(
+                    ["div", "section"],
+                    class_=lambda c: c
+                    and any(
+                        term in (c or "").lower()
+                        for term in ["job-description", "details", "requirements"]
+                    ),
+                )
                 if description_elem:
                     description_elem = remove_attributes(description_elem)
                     description = description_elem.prettify(formatter="html")
-                    if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+                    if (
+                        hasattr(self.scraper_input, "description_format")
+                        and self.scraper_input.description_format
+                        == DescriptionFormat.MARKDOWN
+                    ):
                         description = markdown_converter(description)
-            
+
             # Extract job type
-            job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
+            job_type_elem = soup.find(
+                ["span", "div"],
+                string=lambda s: s
+                and any(
+                    term in (s or "").lower()
+                    for term in ["job type", "employment type"]
+                ),
+            )
             job_type = None
             if job_type_elem:
-                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True)
+                job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
+                    strip=True
+                )
                 job_type = job_type_text if job_type_text else None
-            
+
             # Extract company industry
-            industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
+            industry_elem = soup.find(
+                ["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
+            )
             company_industry = None
             if industry_elem:
-                industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True)
+                industry_text = industry_elem.find_next(["span", "div"]).get_text(
+                    strip=True
+                )
                 company_industry = industry_text if industry_text else None
-            
+
             return {
                 "description": description,
                 "job_type": job_type,
-                "company_industry": company_industry
+                "company_industry": company_industry,
             }
-        
+
         except Exception as e:
             log.error(f"Error getting job details: {str(e)}")
-            return {}
\ No newline at end of file
+            return {}
diff --git a/pyproject.toml b/pyproject.toml
index 824c0f0..52483ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.80"
+version = "1.1.82"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
 authors = ["Cullen Watson ", "Zachary Hampton "]
 homepage = "https://github.com/cullenwatson/JobSpy"
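For orientation on the `_get_job_details` hunk: the description extraction anchors on the `h4#job_resp` heading inside `div.jobcontent`, then walks `find_next_siblings()`, collecting `<p>` and `<ul>` text until it reaches the next `hr`/`h4`/`h5`. A self-contained sketch of that traversal, where the markup is invented to mimic the structure the scraper expects:

```python
from bs4 import BeautifulSoup

# Invented markup mimicking the page structure _get_job_details parses.
html = """
<div class="jobcontent">
  <h4 id="job_resp">Job Responsibilities</h4>
  <p>Build and ship features.</p>
  <ul><li>Write Python</li><li>Review code</li></ul>
  <hr>
  <h4>Educational Requirements</h4>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
heading = soup.find("h4", id="job_resp")
parts = []
for sibling in heading.find_next_siblings():
    if sibling.name in ["hr", "h4", "h5"]:
        break  # next section boundary: stop, as the scraper does
    if sibling.name == "ul":
        parts.extend(li.get_text(separator=" ", strip=True) for li in sibling.find_all("li"))
    elif sibling.name == "p":
        parts.append(sibling.get_text(separator=" ", strip=True))

print("\n".join(parts))
# Build and ship features.
# Write Python
# Review code
```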