From 1be009b8bc91c31eea4516b462afe32180c358ce Mon Sep 17 00:00:00 2001
From: Abdulrahman Hisham <40188935+aHishamm@users.noreply.github.com>
Date: Fri, 21 Feb 2025 22:29:54 +0400
Subject: [PATCH] Adding Bayt.com Scraper to current codebase (#246)

Adds a Bayt.com scraper and registers it with scrape_jobs.

Review fixes applied on top of the original submission:
- Job ids are derived from an MD5 digest of the job URL instead of the
  builtin hash(), which is salted per process (PYTHONHASHSEED) and
  therefore unstable across runs.
- Removed the module-level logger.setLevel("DEBUG") so the package's
  set_logger_level(verbose) configuration is respected.
- The final trim of job_list uses the same results_wanted fallback as
  the pagination loop (scraper_input.results_wanted may be None).
---
 src/jobspy/__init__.py                |   2 +
 src/jobspy/scrapers/__init__.py       |   1 +
 src/jobspy/scrapers/bayt/__init__.py  | 159 ++++++++++++++++++++++++++
 src/jobspy/scrapers/bayt/constants.py |   0
 src/jobspy/scrapers/exceptions.py     |   4 +
 5 files changed, 166 insertions(+)
 create mode 100644 src/jobspy/scrapers/bayt/__init__.py
 create mode 100644 src/jobspy/scrapers/bayt/constants.py

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 0ad21b8..c05c55f 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -11,6 +11,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
 from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
+from .scrapers.bayt import BaytScraper
 from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
 from .scrapers.exceptions import (
     LinkedInException,
@@ -54,6 +55,7 @@ def scrape_jobs(
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
         Site.GOOGLE: GoogleJobsScraper,
+        Site.BAYT: BaytScraper,
     }
 
     set_logger_level(verbose)
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 25c0841..63c00d5 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -18,6 +18,7 @@ class Site(Enum):
     ZIP_RECRUITER = "zip_recruiter"
     GLASSDOOR = "glassdoor"
     GOOGLE = "google"
+    BAYT = "bayt"
 
 
 class SalarySource(Enum):
diff --git a/src/jobspy/scrapers/bayt/__init__.py b/src/jobspy/scrapers/bayt/__init__.py
new file mode 100644
index 0000000..d5c9ddd
--- /dev/null
+++ b/src/jobspy/scrapers/bayt/__init__.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import random
+import time
+from hashlib import md5
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+from .. import Scraper, ScraperInput, Site
+from ..exceptions import BaytException
+from ...jobs import JobPost, JobResponse, Location, Country
+from ..utils import create_logger
+
+logger = create_logger("Bayt")
+
+
+class BaytScraper(Scraper):
+    base_url = "https://www.bayt.com"
+    delay = 2
+    band_delay = 3
+
+    def __init__(
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+    ):
+        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
+        self.scraper_input = None
+        self.country = "worldwide"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        self.scraper_input = scraper_input
+        job_list: list[JobPost] = []
+        page = 1
+        results_wanted = (
+            scraper_input.results_wanted if scraper_input.results_wanted else 10
+        )
+
+        while len(job_list) < results_wanted:
+            logger.info(f"Fetching Bayt jobs page {page}")
+            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
+            if not job_elements:
+                break
+
+            if job_elements:
+                logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
+
+            initial_count = len(job_list)
+            for job in job_elements:
+                try:
+                    job_post = self._extract_job_info(job)
+                    if job_post:
+                        job_list.append(job_post)
+                        if len(job_list) >= results_wanted:
+                            break
+                    else:
+                        logger.debug(
+                            "Extraction returned None. Job snippet:\n"
+                            + job.prettify()[:500]
+                        )
+                except Exception as e:
+                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
+                    continue
+
+            if len(job_list) == initial_count:
+                logger.info(f"No new jobs found on page {page}. Ending pagination.")
+                break
+
+            page += 1
+            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+
+        # Trim with the same fallback the loop used; scraper_input.results_wanted
+        # may be None here, which would skip trimming entirely.
+        job_list = job_list[:results_wanted]
+        return JobResponse(jobs=job_list)
+
+    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
+        """
+        Grabs the job results for the given query and page number.
+        """
+        try:
+            # Updated URL to include the "international" segment as per the original code.
+            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
+            logger.info(f"Constructed URL: {url}")
+            headers = {
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/115.0.0.0 Safari/537.36"
+                )
+            }
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+            # Use the attribute selector as in the original code.
+            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
+            logger.info(f"Found {len(job_listings)} job listing elements")
+            return job_listings
+        except Exception as e:
+            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
+            return None
+
+    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
+        """
+        Extracts the job information from a single job listing.
+        """
+        # Find the h2 element holding the title and link (no class filtering)
+        job_general_information = job.find("h2")
+        if not job_general_information:
+            return None
+
+        job_title = job_general_information.get_text(strip=True)
+        job_url = self._extract_job_url(job_general_information)
+        if not job_url:
+            return None
+
+        # Extract company name using the original approach:
+        company_tag = job.find("div", class_="t-nowrap p10l")
+        company_name = (
+            company_tag.find("span").get_text(strip=True)
+            if company_tag and company_tag.find("span")
+            else None
+        )
+
+        # Extract location using the original approach:
+        location_tag = job.find("div", class_="t-mute t-small")
+        location = location_tag.get_text(strip=True) if location_tag else None
+
+        job_id = f"bayt-{md5(job_url.encode()).hexdigest()}"  # stable across runs; hash() is salted
+        location_obj = Location(
+            city=location,
+            country=Country.from_string(self.country),
+        )
+
+        return JobPost(
+            id=job_id,
+            title=job_title,
+            company_name=company_name,
+            company_url="",
+            location=location_obj,
+            date_posted=None,
+            job_url=job_url,
+            compensation=None,
+            job_type=None,
+            job_level=None,
+            company_industry=None,
+            description=None,
+            job_url_direct=None,
+            emails=[],
+            company_logo=None,
+            job_function=None,
+        )
+
+    def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
+        """
+        Pulls the job URL from the 'a' within the h2 element.
+        """
+        a_tag = job_general_information.find("a")
+        if a_tag and a_tag.has_attr("href"):
+            return self.base_url + a_tag["href"].strip()
+        return None
diff --git a/src/jobspy/scrapers/bayt/constants.py b/src/jobspy/scrapers/bayt/constants.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py
index eba0479..5fc1d11 100644
--- a/src/jobspy/scrapers/exceptions.py
+++ b/src/jobspy/scrapers/exceptions.py
@@ -29,3 +29,7 @@ class GlassdoorException(Exception):
 class GoogleJobsException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Google Jobs")
+
+class BaytException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Bayt")