Added Bayt Scraper integration

pull/246/head
Abdulrahman Al Muaitah 2025-02-21 15:31:29 +04:00
parent 13c74a0fed
commit c6ade14784
5 changed files with 162 additions and 0 deletions

View File

@@ -11,6 +11,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers.bayt import BaytScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
    LinkedInException,
@@ -54,6 +55,7 @@ def scrape_jobs(
        Site.ZIP_RECRUITER: ZipRecruiterScraper,
        Site.GLASSDOOR: GlassdoorScraper,
        Site.GOOGLE: GoogleJobsScraper,
        Site.BAYT: BaytScraper,
    }
    set_logger_level(verbose)

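With BaytScraper imported and registered in the site mapping above, Bayt becomes reachable through the package's public entry point. A minimal usage sketch, assuming the package is installed as python-jobspy and that scrape_jobs accepts site_name, search_term, and results_wanted as it does for the existing sites:

from jobspy import scrape_jobs

# "bayt" is the new Site enum value added in this commit;
# scrape_jobs maps it to BaytScraper via the registration above.
jobs = scrape_jobs(
    site_name="bayt",
    search_term="software engineer",
    results_wanted=10,
)
print(jobs)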
View File

@@ -18,6 +18,7 @@ class Site(Enum):
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"


class SalarySource(Enum):

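The string value matters because callers pass site names as strings, and the standard Enum value lookup turns "bayt" into the new member. A self-contained illustration (stub enum shown for brevity):

from enum import Enum

class Site(Enum):
    BAYT = "bayt"

assert Site("bayt") is Site.BAYT
assert Site.BAYT.value == "bayt"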
View File

@@ -0,0 +1,155 @@
from __future__ import annotations

import time
import random
from typing import Optional

import requests
from bs4 import BeautifulSoup

from .. import Scraper, ScraperInput, Site
from ..exceptions import BaytException
from ...jobs import JobPost, JobResponse, Location, Country
from ..utils import create_logger

logger = create_logger("Bayt")
logger.setLevel("DEBUG")  # Ensure DEBUG messages are output


class BaytScraper(Scraper):
    base_url = "https://www.bayt.com"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
        self.scraper_input = None
        self.country = "worldwide"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        page = 1
        results_wanted = (
            scraper_input.results_wanted if scraper_input.results_wanted else 10
        )

        while len(job_list) < results_wanted:
            logger.info(f"Fetching Bayt jobs page {page}")
            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
            if not job_elements:
                break
            logger.debug(
                "First job element snippet:\n" + job_elements[0].prettify()[:500]
            )

            initial_count = len(job_list)
            for job in job_elements:
                try:
                    job_post = self._extract_job_info(job)
                    if job_post:
                        job_list.append(job_post)
                        if len(job_list) >= results_wanted:
                            break
                    else:
                        logger.debug(
                            "Extraction returned None. Job snippet:\n"
                            + job.prettify()[:500]
                        )
                except Exception as e:
                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
                    continue

            # Stop paginating once a page yields no new jobs.
            if len(job_list) == initial_count:
                logger.info(f"No new jobs found on page {page}. Ending pagination.")
                break

            page += 1
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

        job_list = job_list[:results_wanted]
        return JobResponse(jobs=job_list)

    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
        """
        Fetches the job listing elements for the given query and page number.
        """
        try:
            url = f"{self.base_url}/en/jobs/{query}-jobs/?page={page}"
            logger.info(f"Constructed URL: {url}")
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/115.0.0.0 Safari/537.36"
                )
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            job_listings = soup.find_all("li", class_="has-pointer-d")
            logger.info(f"Found {len(job_listings)} job listing elements")
            return job_listings
        except Exception as e:
            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
            return None

    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
        """
        Extracts the job information from a single job listing element.
        """
        # The h2 with class jb-title holds the title and link.
        job_general_information = job.find("h2", class_="jb-title")
        if not job_general_information:
            return None

        job_title = job_general_information.text.strip()
        job_url = self._extract_job_url(job_general_information)
        if not job_url:
            return None

        # Company name
        company_tag = job.find("b", class_="jb-company")
        company_name = company_tag.text.strip() if company_tag else None

        # Location
        location_tag = job.find("span", class_="jb-loc")
        location = location_tag.text.strip() if location_tag else None

        # Build the JobPost object. Note that hash() is salted per process,
        # so these ids are not stable across runs.
        job_id = f"bayt-{abs(hash(job_url))}"
        location_obj = Location(
            city=location,
            country=Country.from_string(self.country),
        )
        return JobPost(
            id=job_id,
            title=job_title,
            company_name=company_name,
            company_url="",
            location=location_obj,
            date_posted=None,
            job_url=job_url,
            compensation=None,
            job_type=None,
            job_level=None,
            company_industry=None,
            description=None,
            job_url_direct=None,
            emails=[],
            company_logo=None,
            job_function=None,
        )

    def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
        """
        Pulls the job URL from the 'a' tag within h2.jb-title.
        """
        a_tag = job_general_information.find("a")
        if a_tag and a_tag.has_attr("href"):
            return self.base_url + a_tag["href"].strip()
        return None

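The scraper can also be exercised on its own, which is useful when testing the CSS selectors against live pages. A sketch under the assumption that ScraperInput exposes site_type, search_term, and results_wanted the way the other scrapers consume them, and that the package root is jobspy:

from jobspy.scrapers import ScraperInput, Site
from jobspy.scrapers.bayt import BaytScraper

scraper = BaytScraper()
# The search term is interpolated directly into the URL path
# ("/en/jobs/{query}-jobs/"), so multi-word terms likely need hyphens.
scraper_input = ScraperInput(
    site_type=[Site.BAYT],
    search_term="python-developer",
    results_wanted=5,
)
response = scraper.scrape(scraper_input)
for post in response.jobs:
    print(post.title, "-", post.job_url)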
View File

View File

@@ -29,3 +29,7 @@ class GlassdoorException(Exception):
class GoogleJobsException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Google Jobs")


class BaytException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Bayt")
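Like the other site-specific exceptions, BaytException substitutes a default message when constructed without one. Note that the scraper above currently logs errors and returns None rather than raising it; a hypothetical call site would look like:

from jobspy.scrapers.exceptions import BaytException

try:
    raise BaytException()  # no message supplied
except BaytException as e:
    print(e)  # -> An error occurred with Bayt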