From c6ade14784ef971f55500930b0ae9671556f1ff8 Mon Sep 17 00:00:00 2001
From: Abdulrahman Al Muaitah
Date: Fri, 21 Feb 2025 15:31:29 +0400
Subject: [PATCH 1/4] Added Bayt Scraper integration

---
 src/jobspy/__init__.py                |   2 +
 src/jobspy/scrapers/__init__.py       |   1 +
 src/jobspy/scrapers/bayt/__init__.py  | 155 ++++++++++++++++++++++++++
 src/jobspy/scrapers/bayt/constants.py |   0
 src/jobspy/scrapers/exceptions.py     |   4 +
 5 files changed, 162 insertions(+)
 create mode 100644 src/jobspy/scrapers/bayt/__init__.py
 create mode 100644 src/jobspy/scrapers/bayt/constants.py

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 0ad21b8..c05c55f 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -11,6 +11,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
 from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
+from .scrapers.bayt import BaytScraper
 from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
 from .scrapers.exceptions import (
     LinkedInException,
@@ -54,6 +55,7 @@ def scrape_jobs(
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
         Site.GOOGLE: GoogleJobsScraper,
+        Site.BAYT: BaytScraper,
     }
     set_logger_level(verbose)
 

diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 25c0841..63c00d5 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -18,6 +18,7 @@ class Site(Enum):
     ZIP_RECRUITER = "zip_recruiter"
     GLASSDOOR = "glassdoor"
     GOOGLE = "google"
+    BAYT = "bayt"
 
 
 class SalarySource(Enum):

diff --git a/src/jobspy/scrapers/bayt/__init__.py b/src/jobspy/scrapers/bayt/__init__.py
new file mode 100644
index 0000000..6d3b6b5
--- /dev/null
+++ b/src/jobspy/scrapers/bayt/__init__.py
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+import time
+import random
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+from .. import Scraper, ScraperInput, Site
+from ..exceptions import BaytException
+from ...jobs import JobPost, JobResponse, Location, Country
+from ..utils import create_logger
+
+logger = create_logger("Bayt")
+logger.setLevel("DEBUG")  # Ensure DEBUG messages are output
+
+
+class BaytScraper(Scraper):
+    base_url = "https://www.bayt.com"
+    delay = 2
+    band_delay = 3
+
+    def __init__(
+        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+    ):
+        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
+        self.scraper_input = None
+        self.country = "worldwide"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        self.scraper_input = scraper_input
+        job_list: list[JobPost] = []
+        page = 1
+        results_wanted = (
+            scraper_input.results_wanted if scraper_input.results_wanted else 10
+        )
+
+        while len(job_list) < results_wanted:
+            logger.info(f"Fetching Bayt jobs page {page}")
+            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
+            if not job_elements:
+                break
+
+            if job_elements:
+                logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
+
+            initial_count = len(job_list)
+            for job in job_elements:
+                try:
+                    job_post = self._extract_job_info(job)
+                    if job_post:
+                        job_list.append(job_post)
+                        if len(job_list) >= results_wanted:
+                            break
+                    else:
+                        logger.debug(
+                            "Extraction returned None. Job snippet:\n"
+                            + job.prettify()[:500]
+                        )
+                except Exception as e:
+                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
+                    continue
+
+            if len(job_list) == initial_count:
+                logger.info(f"No new jobs found on page {page}. Ending pagination.")
+                break
+
+            page += 1
+            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
+
+        job_list = job_list[: scraper_input.results_wanted]
+        return JobResponse(jobs=job_list)
+
+    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
+        """
+        Grabs the job results for the given query and page number.
+        """
+        try:
+            url = f"{self.base_url}/en/jobs/{query}-jobs/?page={page}"
+            logger.info(f"Constructed URL: {url}")
+            headers = {
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/115.0.0.0 Safari/537.36"
+                )
+            }
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+            job_listings = soup.find_all("li", class_="has-pointer-d")
+            logger.info(f"Found {len(job_listings)} job listing elements")
+            return job_listings
+        except Exception as e:
+            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
+            return None
+
+    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
+        """
+        Extracts the job information from a single job listing,
+        mirroring your original code's logic for company and location.
+        """
+        # The h2 with class jb-title holds the title and link
+        job_general_information = job.find("h2", class_="jb-title")
+        if not job_general_information:
+            return None
+
+        job_title = job_general_information.text.strip()
+        job_url = self._extract_job_url(job_general_information)
+        if not job_url:
+            return None
+
+        # --- Company Name (original approach) ---
+        company_tag = job.find("b", class_="jb-company")
+        company_name = company_tag.text.strip() if company_tag else None
+
+        # --- Location (original approach) ---
+        location_tag = job.find("span", class_="jb-loc")
+        location = location_tag.text.strip() if location_tag else None
+
+        # Build our JobPost object
+        job_id = f"bayt-{abs(hash(job_url))}"
+        location_obj = Location(
+            city=location,
+            country=Country.from_string(self.country),
+        )
+
+        return JobPost(
+            id=job_id,
+            title=job_title,
+            company_name=company_name,
+            company_url="",
+            location=location_obj,
+            date_posted=None,
+            job_url=job_url,
+            compensation=None,
+            job_type=None,
+            job_level=None,
+            company_industry=None,
+            description=None,
+            job_url_direct=None,
+            emails=[],
+            company_logo=None,
+            job_function=None,
+        )
+
+    def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
+        """
+        Pulls the job URL from the 'a' within h2.jb-title.
+        """
+        a_tag = job_general_information.find("a")
+        if a_tag and a_tag.has_attr("href"):
+            return self.base_url + a_tag["href"].strip()
+        return None

diff --git a/src/jobspy/scrapers/bayt/constants.py b/src/jobspy/scrapers/bayt/constants.py
new file mode 100644
index 0000000..e69de29

diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py
index eba0479..5fc1d11 100644
--- a/src/jobspy/scrapers/exceptions.py
+++ b/src/jobspy/scrapers/exceptions.py
@@ -29,3 +29,7 @@ class GlassdoorException(Exception):
 class GoogleJobsException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Google Jobs")
+
+class BaytException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Bayt")

From 11a9e9a56ab953a13556c110b74838a57b0c3733 Mon Sep 17 00:00:00 2001
From: Abdulrahman Al Muaitah
Date: Fri, 21 Feb 2025 20:10:02 +0400
Subject: [PATCH 2/4] Fixed Bayt scraper integration

---
 src/jobspy/scrapers/bayt/__init__.py | 34 ++++++++++++++++------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/jobspy/scrapers/bayt/__init__.py b/src/jobspy/scrapers/bayt/__init__.py
index 6d3b6b5..d5c9ddd 100644
--- a/src/jobspy/scrapers/bayt/__init__.py
+++ b/src/jobspy/scrapers/bayt/__init__.py
@@ -77,7 +77,8 @@ class BaytScraper(Scraper):
         Grabs the job results for the given query and page number.
         """
         try:
-            url = f"{self.base_url}/en/jobs/{query}-jobs/?page={page}"
+            # Updated URL to include the "international" segment as per the original code.
+            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
             logger.info(f"Constructed URL: {url}")
             headers = {
                 "User-Agent": (
@@ -89,7 +90,8 @@ class BaytScraper(Scraper):
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            job_listings = soup.find_all("li", class_="has-pointer-d")
+            # Use the attribute selector as in the original code.
+            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
             logger.info(f"Found {len(job_listings)} job listing elements")
             return job_listings
         except Exception as e:
@@ -98,28 +100,30 @@ class BaytScraper(Scraper):
 
     def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
         """
-        Extracts the job information from a single job listing,
-        mirroring your original code's logic for company and location.
+        Extracts the job information from a single job listing.
""" - # The h2 with class jb-title holds the title and link - job_general_information = job.find("h2", class_="jb-title") + # Find the h2 element holding the title and link (no class filtering) + job_general_information = job.find("h2") if not job_general_information: return None - job_title = job_general_information.text.strip() + job_title = job_general_information.get_text(strip=True) job_url = self._extract_job_url(job_general_information) if not job_url: return None - # --- Company Name (original approach) --- - company_tag = job.find("b", class_="jb-company") - company_name = company_tag.text.strip() if company_tag else None + # Extract company name using the original approach: + company_tag = job.find("div", class_="t-nowrap p10l") + company_name = ( + company_tag.find("span").get_text(strip=True) + if company_tag and company_tag.find("span") + else None + ) - # --- Location (original approach) --- - location_tag = job.find("span", class_="jb-loc") - location = location_tag.text.strip() if location_tag else None + # Extract location using the original approach: + location_tag = job.find("div", class_="t-mute t-small") + location = location_tag.get_text(strip=True) if location_tag else None - # Build our JobPost object job_id = f"bayt-{abs(hash(job_url))}" location_obj = Location( city=location, @@ -147,7 +151,7 @@ class BaytScraper(Scraper): def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]: """ - Pulls the job URL from the 'a' within h2.jb-title. + Pulls the job URL from the 'a' within the h2 element. """ a_tag = job_general_information.find("a") if a_tag and a_tag.has_attr("href"): From 81ed9b3ddf3468e6de7579bc86d79378aed28822 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 21 Feb 2025 12:29:28 -0600 Subject: [PATCH 3/4] enh:remove log by default --- src/jobspy/__init__.py | 2 +- src/jobspy/scrapers/bayt/__init__.py | 72 ++++++++------------ src/jobspy/scrapers/bayt/constants.py | 0 src/jobspy/scrapers/exceptions.py | 1 + src/jobspy/scrapers/glassdoor/__init__.py | 16 ++--- src/jobspy/scrapers/google/__init__.py | 17 ++--- src/jobspy/scrapers/indeed/__init__.py | 9 +-- src/jobspy/scrapers/linkedin/__init__.py | 10 +-- src/jobspy/scrapers/utils.py | 9 ++- src/jobspy/scrapers/ziprecruiter/__init__.py | 36 +++++++--- tests/__init__.py | 0 tests/test_all.py | 18 ----- tests/test_glassdoor.py | 13 ---- tests/test_google.py | 12 ---- tests/test_indeed.py | 13 ---- tests/test_linkedin.py | 9 --- tests/test_ziprecruiter.py | 12 ---- 17 files changed, 90 insertions(+), 159 deletions(-) delete mode 100644 src/jobspy/scrapers/bayt/constants.py delete mode 100644 tests/__init__.py delete mode 100644 tests/test_all.py delete mode 100644 tests/test_glassdoor.py delete mode 100644 tests/test_google.py delete mode 100644 tests/test_indeed.py delete mode 100644 tests/test_linkedin.py delete mode 100644 tests/test_ziprecruiter.py diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index c05c55f..8183338 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -42,7 +42,7 @@ def scrape_jobs( offset: int | None = 0, hours_old: int = None, enforce_annual_salary: bool = False, - verbose: int = 2, + verbose: int = 0, **kwargs, ) -> pd.DataFrame: """ diff --git a/src/jobspy/scrapers/bayt/__init__.py b/src/jobspy/scrapers/bayt/__init__.py index d5c9ddd..12b375e 100644 --- a/src/jobspy/scrapers/bayt/__init__.py +++ b/src/jobspy/scrapers/bayt/__init__.py @@ -1,19 +1,22 @@ +""" +jobspy.scrapers.bayt +~~~~~~~~~~~~~~~~~~~ + +This module contains 
+"""
+
 from __future__ import annotations
 
-import time
 import random
-from typing import Optional
+import time
 
-import requests
 from bs4 import BeautifulSoup
 
 from .. import Scraper, ScraperInput, Site
-from ..exceptions import BaytException
+from ..utils import create_logger, create_session
 from ...jobs import JobPost, JobResponse, Location, Country
-from ..utils import create_logger
 
-logger = create_logger("Bayt")
-logger.setLevel("DEBUG")  # Ensure DEBUG messages are output
+log = create_logger("Bayt")
 
 
 class BaytScraper(Scraper):
@@ -26,10 +29,14 @@ class BaytScraper(Scraper):
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None
+        self.session = None
         self.country = "worldwide"
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         self.scraper_input = scraper_input
+        self.session = create_session(
+            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
+        )
         job_list: list[JobPost] = []
         page = 1
         results_wanted = (
@@ -37,13 +44,15 @@
         )
 
         while len(job_list) < results_wanted:
-            logger.info(f"Fetching Bayt jobs page {page}")
+            log.info(f"Fetching Bayt jobs page {page}")
             job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
             if not job_elements:
                 break
 
             if job_elements:
-                logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
+                log.debug(
+                    "First job element snippet:\n" + job_elements[0].prettify()[:500]
+                )
 
             initial_count = len(job_list)
             for job in job_elements:
@@ -54,16 +63,16 @@
                         if len(job_list) >= results_wanted:
                             break
                     else:
-                        logger.debug(
+                        log.debug(
                             "Extraction returned None. Job snippet:\n"
                             + job.prettify()[:500]
                         )
                 except Exception as e:
-                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
+                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                     continue
 
             if len(job_list) == initial_count:
-                logger.info(f"No new jobs found on page {page}. Ending pagination.")
+                log.info(f"No new jobs found on page {page}. Ending pagination.")
                 break
 
             page += 1
@@ -72,45 +81,35 @@
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
 
-    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
+    def _fetch_jobs(self, query: str, page: int) -> list | None:
         """
         Grabs the job results for the given query and page number.
         """
         try:
-            # Updated URL to include the "international" segment as per the original code.
             url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
-            logger.info(f"Constructed URL: {url}")
-            headers = {
-                "User-Agent": (
-                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                    "AppleWebKit/537.36 (KHTML, like Gecko) "
-                    "Chrome/115.0.0.0 Safari/537.36"
-                )
-            }
-            response = requests.get(url, headers=headers, timeout=10)
+            response = self.session.get(url)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            # Use the attribute selector as in the original code.
             job_listings = soup.find_all("li", attrs={"data-js-job": ""})
-            logger.info(f"Found {len(job_listings)} job listing elements")
+            log.debug(f"Found {len(job_listings)} job listing elements")
             return job_listings
         except Exception as e:
-            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
+            log.error(f"Bayt: Error fetching jobs - {str(e)}")
             return None
 
-    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
+    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
         """
         Extracts the job information from a single job listing.
""" # Find the h2 element holding the title and link (no class filtering) job_general_information = job.find("h2") if not job_general_information: - return None + return job_title = job_general_information.get_text(strip=True) job_url = self._extract_job_url(job_general_information) if not job_url: - return None + return # Extract company name using the original approach: company_tag = job.find("div", class_="t-nowrap p10l") @@ -129,31 +128,18 @@ class BaytScraper(Scraper): city=location, country=Country.from_string(self.country), ) - return JobPost( id=job_id, title=job_title, company_name=company_name, - company_url="", location=location_obj, - date_posted=None, job_url=job_url, - compensation=None, - job_type=None, - job_level=None, - company_industry=None, - description=None, - job_url_direct=None, - emails=[], - company_logo=None, - job_function=None, ) - def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]: + def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None: """ Pulls the job URL from the 'a' within the h2 element. """ a_tag = job_general_information.find("a") if a_tag and a_tag.has_attr("href"): return self.base_url + a_tag["href"].strip() - return None diff --git a/src/jobspy/scrapers/bayt/constants.py b/src/jobspy/scrapers/bayt/constants.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py index 5fc1d11..ad63b06 100644 --- a/src/jobspy/scrapers/exceptions.py +++ b/src/jobspy/scrapers/exceptions.py @@ -30,6 +30,7 @@ class GoogleJobsException(Exception): def __init__(self, message=None): super().__init__(message or "An error occurred with Google Jobs") + class BaytException(Exception): def __init__(self, message=None): super().__init__(message or "An error occurred with Bayt") diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index d2de6dc..0455ec2 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -32,7 +32,7 @@ from ...jobs import ( DescriptionFormat, ) -logger = create_logger("Glassdoor") +log = create_logger("Glassdoor") class GlassdoorScraper(Scraper): @@ -64,7 +64,7 @@ class GlassdoorScraper(Scraper): self.base_url = self.scraper_input.country.get_glassdoor_url() self.session = create_session( - proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True + proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True ) token = self._get_csrf_token() headers["gd-csrf-token"] = token if token else fallback_token @@ -74,7 +74,7 @@ class GlassdoorScraper(Scraper): scraper_input.location, scraper_input.is_remote ) if location_type is None: - logger.error("Glassdoor: location not parsed") + log.error("Glassdoor: location not parsed") return JobResponse(jobs=[]) job_list: list[JobPost] = [] cursor = None @@ -83,7 +83,7 @@ class GlassdoorScraper(Scraper): tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2 range_end = min(tot_pages, self.max_pages + 1) for page in range(range_start, range_end): - logger.info(f"search page: {page} / {range_end-1}") + log.info(f"search page: {page} / {range_end - 1}") try: jobs, cursor = self._fetch_jobs_page( scraper_input, location_id, location_type, page, cursor @@ -93,7 +93,7 @@ class GlassdoorScraper(Scraper): job_list = job_list[: scraper_input.results_wanted] break except Exception as e: - logger.error(f"Glassdoor: {str(e)}") + log.error(f"Glassdoor: {str(e)}") break return 
@@ -129,7 +129,7 @@
             ValueError,
             Exception,
         ) as e:
-            logger.error(f"Glassdoor: {str(e)}")
+            log.error(f"Glassdoor: {str(e)}")
             return jobs, None
 
         jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -264,12 +264,12 @@
         if res.status_code != 200:
             if res.status_code == 429:
                 err = f"429 Response - Blocked by Glassdoor for too many requests"
-                logger.error(err)
+                log.error(err)
                 return None, None
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-                logger.error(f"Glassdoor response status code {res.status_code}")
+                log.error(f"Glassdoor response status code {res.status_code}")
                 return None, None
 
         items = res.json()

diff --git a/src/jobspy/scrapers/google/__init__.py b/src/jobspy/scrapers/google/__init__.py
index 523e6f5..0c47278 100644
--- a/src/jobspy/scrapers/google/__init__.py
+++ b/src/jobspy/scrapers/google/__init__.py
@@ -26,7 +26,7 @@ from ...jobs import (
     JobType,
 )
 
-logger = create_logger("Google")
+log = create_logger("Google")
 
 
 class GoogleJobsScraper(Scraper):
@@ -61,7 +61,7 @@ class GoogleJobsScraper(Scraper):
         )
         forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.warning(
+            log.warning(
                 "initial cursor not found, try changing your query or there was at most 10 results"
             )
             return JobResponse(jobs=job_list)
@@ -72,16 +72,16 @@ class GoogleJobsScraper(Scraper):
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
             and forward_cursor
         ):
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             try:
                 jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
             except Exception as e:
-                logger.error(f"failed to get jobs on page: {page}, {e}")
+                log.error(f"failed to get jobs on page: {page}, {e}")
                 break
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -230,10 +230,7 @@ class GoogleJobsScraper(Scraper):
 
     @staticmethod
     def _find_job_info_initial_page(html_text: str):
-        pattern = (
-            f'520084652":('
-            + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
-        )
+        pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
         results = []
 
         matches = re.finditer(pattern, html_text)
@@ -245,6 +242,6 @@
                 results.append(parsed_data)
             except json.JSONDecodeError as e:
-                logger.error(f"Failed to parse match: {str(e)}")
+                log.error(f"Failed to parse match: {str(e)}")
                 results.append({"raw_match": match.group(0), "error": str(e)})
 
         return results

diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index bd379ab..b9235ae 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -30,7 +30,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("Indeed")
+log = create_logger("Indeed")
 
 
 class IndeedScraper(Scraper):
@@ -71,12 +71,12 @@ class IndeedScraper(Scraper):
         cursor = None
 
         while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -122,9 +122,10 @@ class IndeedScraper(Scraper):
             headers=api_headers_temp,
             json=payload,
             timeout=10,
+            verify=False,
         )
         if not response.ok:
-            logger.info(
+            log.info(
                 f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor

diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index c3629f6..d854cbe 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -38,7 +38,7 @@ from ..utils import (
     markdown_converter,
 )
 
-logger = create_logger("LinkedIn")
+log = create_logger("LinkedIn")
 
 
 class LinkedInScraper(Scraper):
@@ -86,7 +86,7 @@ class LinkedInScraper(Scraper):
         )
         while continue_search():
             request_count += 1
-            logger.info(
+            log.info(
                 f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
             )
             params = {
@@ -126,13 +126,13 @@ class LinkedInScraper(Scraper):
                 else:
                     err = f"LinkedIn response status code {response.status_code}"
                     err += f" - {response.text}"
-                    logger.error(err)
+                    log.error(err)
                     return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
+                    log.error(f"LinkedIn: Bad proxy")
                 else:
-                    logger.error(f"LinkedIn: {str(e)}")
+                    log.error(f"LinkedIn: {str(e)}")
                 return JobResponse(jobs=job_list)
 
             soup = BeautifulSoup(response.text, "html.parser")

diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 7c032d7..32e0663 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
-import re
 import logging
+import re
 from itertools import cycle
 
+import numpy as np
 import requests
 import tls_client
-import numpy as np
+import urllib3
 from markdownify import markdownify as md
 from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import CompensationInterval, JobType
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 
 def create_logger(name: str):
     logger = logging.getLogger(f"JobSpy:{name}")
@@ -129,7 +132,7 @@ def create_session(
     return session
 
 
-def set_logger_level(verbose: int = 2):
+def set_logger_level(verbose: int):
     """
     Adjusts the logger's level. This function allows the logging level to be
     changed at runtime.
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 294ca8c..816331e 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -11,11 +11,10 @@ import json
 import math
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from typing import Optional, Tuple, Any
 
-from concurrent.futures import ThreadPoolExecutor
-
 from bs4 import BeautifulSoup
 
 from .constants import headers
@@ -37,7 +36,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("ZipRecruiter")
+log = create_logger("ZipRecruiter")
 
 
 class ZipRecruiterScraper(Scraper):
@@ -77,7 +76,7 @@ class ZipRecruiterScraper(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
-            logger.info(f"search page: {page} / {max_pages}")
+            log.info(f"search page: {page} / {max_pages}")
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )
@@ -110,13 +109,13 @@ class ZipRecruiterScraper(Scraper):
             else:
                 err = f"ZipRecruiter response status code {res.status_code}"
                 err += f" with response: {res.text}"  # ZipRecruiter likely not available in EU
-                logger.error(err)
+                log.error(err)
                 return jobs_list, ""
         except Exception as e:
             if "Proxy responded with" in str(e):
-                logger.error(f"Indeed: Bad proxy")
+                log.error(f"Indeed: Bad proxy")
             else:
-                logger.error(f"Indeed: {str(e)}")
+                log.error(f"Indeed: {str(e)}")
             return jobs_list, ""
 
         res_data = res.json()
@@ -215,7 +214,28 @@ class ZipRecruiterScraper(Scraper):
         return description_full, job_url_direct
 
     def _get_cookies(self):
-        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        """
+        Sends a session event to the API with device properties.
+        """
+        data = [
+            ("event_type", "session"),
+            ("logged_in", "false"),
+            ("number_of_retry", "1"),
+            ("property", "model:iPhone"),
+            ("property", "os:iOS"),
+            ("property", "locale:en_us"),
+            ("property", "app_build_number:4734"),
+            ("property", "app_version:91.0"),
+            ("property", "manufacturer:Apple"),
+            ("property", "timestamp:2025-01-12T12:04:42-06:00"),
+            ("property", "screen_height:852"),
+            ("property", "os_version:16.6.1"),
+            ("property", "source:install"),
+            ("property", "screen_width:393"),
+            ("property", "device_model:iPhone 14 Pro"),
+            ("property", "brand:Apple"),
+        ]
+
         url = f"{self.api_url}/jobs-app/event"
         self.session.post(url, data=data)

diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index e69de29..0000000

diff --git a/tests/test_all.py b/tests/test_all.py
deleted file mode 100644
index 3285611..0000000
--- a/tests/test_all.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_all():
-    sites = [
-        "indeed",
-        "glassdoor",
-    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
-    result = scrape_jobs(
-        site_name=sites,
-        search_term="engineer",
-        results_wanted=5,
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
-    ), "Result should be a non-empty DataFrame"

diff --git a/tests/test_glassdoor.py b/tests/test_glassdoor.py
deleted file mode 100644
index 267a3e6..0000000
--- a/tests/test_glassdoor.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_glassdoor():
-    result = scrape_jobs(
-        site_name="glassdoor",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

diff --git a/tests/test_google.py b/tests/test_google.py
deleted file mode 100644
index 9f30ffe..0000000
--- a/tests/test_google.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_google():
-    result = scrape_jobs(
-        site_name="google", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

diff --git a/tests/test_indeed.py b/tests/test_indeed.py
deleted file mode 100644
index 714fc53..0000000
--- a/tests/test_indeed.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_indeed():
-    result = scrape_jobs(
-        site_name="indeed",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

diff --git a/tests/test_linkedin.py b/tests/test_linkedin.py
deleted file mode 100644
index 080f4b8..0000000
--- a/tests/test_linkedin.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_linkedin():
-    result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

diff --git a/tests/test_ziprecruiter.py b/tests/test_ziprecruiter.py
deleted file mode 100644
index 61de491..0000000
--- a/tests/test_ziprecruiter.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_ziprecruiter():
-    result = scrape_jobs(
-        site_name="zip_recruiter", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

From 30060638750310be546b45865bcf44d9313aae87 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Fri, 21 Feb 2025 12:31:04 -0600
Subject: [PATCH 4/4] enh:remove log by default

---
 .github/workflows/python-test.yml | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 .github/workflows/python-test.yml

diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
deleted file mode 100644
index 768c3e4..0000000
--- a/.github/workflows/python-test.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Python Tests
-
-on:
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: '3.8'
-      - name: Install dependencies
-        run: |
-          pip install poetry
-          poetry install
-      - name: Run tests
-        run: poetry run pytest tests/test_all.py
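
With the series above applied, Bayt is reachable through the regular scrape_jobs entry point like the other boards. The snippet below is a minimal usage sketch under that assumption; the search term, result count, and verbose level are illustrative values and are not taken from the patches themselves.

from jobspy import scrape_jobs

# "bayt" maps to the Site.BAYT member and BaytScraper registered in PATCH 1/4.
# PATCH 3/4 changes the default verbose level to 0, so pass a higher value
# if the per-scraper log output is wanted.
jobs = scrape_jobs(
    site_name="bayt",
    search_term="software developer",  # illustrative query
    results_wanted=10,                 # illustrative count
    verbose=2,
)
print(jobs.head())  # scrape_jobs returns a pandas DataFrame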