Fixed Bayt scraper integration

pull/246/head
Abdulrahman Al Muaitah 2025-02-21 20:10:02 +04:00
parent c6ade14784
commit 11a9e9a56a
1 changed file with 19 additions and 15 deletions

View File

@@ -77,7 +77,8 @@ class BaytScraper(Scraper):
         Grabs the job results for the given query and page number.
         """
         try:
-            url = f"{self.base_url}/en/jobs/{query}-jobs/?page={page}"
+            # Updated URL to include the "international" segment as per the original code.
+            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
             logger.info(f"Constructed URL: {url}")
             headers = {
                 "User-Agent": (
@@ -89,7 +90,8 @@ class BaytScraper(Scraper):
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            job_listings = soup.find_all("li", class_="has-pointer-d")
+            # Use the attribute selector as in the original code.
+            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
             logger.info(f"Found {len(job_listings)} job listing elements")
             return job_listings
         except Exception as e:
@@ -98,28 +100,30 @@ class BaytScraper(Scraper):
     def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
         """
-        Extracts the job information from a single job listing,
-        mirroring your original code's logic for company and location.
+        Extracts the job information from a single job listing.
         """
-        # The h2 with class jb-title holds the title and link
-        job_general_information = job.find("h2", class_="jb-title")
+        # Find the h2 element holding the title and link (no class filtering)
+        job_general_information = job.find("h2")
         if not job_general_information:
             return None
-        job_title = job_general_information.text.strip()
+        job_title = job_general_information.get_text(strip=True)
         job_url = self._extract_job_url(job_general_information)
         if not job_url:
             return None
-        # --- Company Name (original approach) ---
-        company_tag = job.find("b", class_="jb-company")
-        company_name = company_tag.text.strip() if company_tag else None
+        # Extract company name using the original approach:
+        company_tag = job.find("div", class_="t-nowrap p10l")
+        company_name = (
+            company_tag.find("span").get_text(strip=True)
+            if company_tag and company_tag.find("span")
+            else None
+        )
-        # --- Location (original approach) ---
-        location_tag = job.find("span", class_="jb-loc")
-        location = location_tag.text.strip() if location_tag else None
+        # Extract location using the original approach:
+        location_tag = job.find("div", class_="t-mute t-small")
+        location = location_tag.get_text(strip=True) if location_tag else None
-        # Build our JobPost object
         job_id = f"bayt-{abs(hash(job_url))}"
         location_obj = Location(
             city=location,
@@ -147,7 +151,7 @@ class BaytScraper(Scraper):
     def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
         """
-        Pulls the job URL from the 'a' within h2.jb-title.
+        Pulls the job URL from the 'a' within the h2 element.
         """
         a_tag = job_general_information.find("a")
         if a_tag and a_tag.has_attr("href"):