enh: remove log by default

main
Cullen Watson 2025-02-21 12:29:28 -06:00
parent 11a9e9a56a
commit 81ed9b3ddf
17 changed files with 90 additions and 159 deletions

View File

@@ -42,7 +42,7 @@ def scrape_jobs(
     offset: int | None = 0,
     hours_old: int = None,
     enforce_annual_salary: bool = False,
-    verbose: int = 2,
+    verbose: int = 0,
     **kwargs,
 ) -> pd.DataFrame:
     """

View File

@@ -1,19 +1,22 @@
+"""
+jobspy.scrapers.bayt
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Bayt.
+"""
+
 from __future__ import annotations
 
-import time
 import random
-from typing import Optional
-import requests
+import time
 
 from bs4 import BeautifulSoup
 
 from .. import Scraper, ScraperInput, Site
-from ..exceptions import BaytException
+from ..utils import create_logger, create_session
 from ...jobs import JobPost, JobResponse, Location, Country
-from ..utils import create_logger
 
-logger = create_logger("Bayt")
-logger.setLevel("DEBUG")  # Ensure DEBUG messages are output
+log = create_logger("Bayt")
 
 
 class BaytScraper(Scraper):
@@ -26,10 +29,14 @@ class BaytScraper(Scraper):
     ):
         super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
         self.scraper_input = None
+        self.session = None
         self.country = "worldwide"
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         self.scraper_input = scraper_input
+        self.session = create_session(
+            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
+        )
         job_list: list[JobPost] = []
         page = 1
         results_wanted = (
@@ -37,13 +44,15 @@ class BaytScraper(Scraper):
         )
 
         while len(job_list) < results_wanted:
-            logger.info(f"Fetching Bayt jobs page {page}")
+            log.info(f"Fetching Bayt jobs page {page}")
             job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
             if not job_elements:
                 break
 
             if job_elements:
-                logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
+                log.debug(
+                    "First job element snippet:\n" + job_elements[0].prettify()[:500]
+                )
 
             initial_count = len(job_list)
             for job in job_elements:
@@ -54,16 +63,16 @@ class BaytScraper(Scraper):
                         if len(job_list) >= results_wanted:
                             break
                     else:
-                        logger.debug(
+                        log.debug(
                             "Extraction returned None. Job snippet:\n"
                             + job.prettify()[:500]
                         )
                 except Exception as e:
-                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
+                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                     continue
 
             if len(job_list) == initial_count:
-                logger.info(f"No new jobs found on page {page}. Ending pagination.")
+                log.info(f"No new jobs found on page {page}. Ending pagination.")
                 break
 
             page += 1
@@ -72,45 +81,35 @@ class BaytScraper(Scraper):
            job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
 
-    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
+    def _fetch_jobs(self, query: str, page: int) -> list | None:
         """
         Grabs the job results for the given query and page number.
         """
         try:
-            # Updated URL to include the "international" segment as per the original code.
             url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
-            logger.info(f"Constructed URL: {url}")
-            headers = {
-                "User-Agent": (
-                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                    "AppleWebKit/537.36 (KHTML, like Gecko) "
-                    "Chrome/115.0.0.0 Safari/537.36"
-                )
-            }
-            response = requests.get(url, headers=headers, timeout=10)
+            response = self.session.get(url)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            # Use the attribute selector as in the original code.
             job_listings = soup.find_all("li", attrs={"data-js-job": ""})
-            logger.info(f"Found {len(job_listings)} job listing elements")
+            log.debug(f"Found {len(job_listings)} job listing elements")
             return job_listings
         except Exception as e:
-            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
+            log.error(f"Bayt: Error fetching jobs - {str(e)}")
             return None
 
-    def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
+    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
         """
         Extracts the job information from a single job listing.
         """
         # Find the h2 element holding the title and link (no class filtering)
         job_general_information = job.find("h2")
         if not job_general_information:
-            return None
+            return
         job_title = job_general_information.get_text(strip=True)
         job_url = self._extract_job_url(job_general_information)
         if not job_url:
-            return None
+            return
 
         # Extract company name using the original approach:
         company_tag = job.find("div", class_="t-nowrap p10l")
@@ -129,31 +128,18 @@ class BaytScraper(Scraper):
             city=location,
             country=Country.from_string(self.country),
         )
-
         return JobPost(
             id=job_id,
             title=job_title,
             company_name=company_name,
-            company_url="",
             location=location_obj,
-            date_posted=None,
             job_url=job_url,
-            compensation=None,
-            job_type=None,
-            job_level=None,
-            company_industry=None,
-            description=None,
-            job_url_direct=None,
-            emails=[],
-            company_logo=None,
-            job_function=None,
         )
 
-    def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
+    def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
         """
         Pulls the job URL from the 'a' within the h2 element.
         """
         a_tag = job_general_information.find("a")
         if a_tag and a_tag.has_attr("href"):
             return self.base_url + a_tag["href"].strip()
-        return None
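
For context, the shared create_session helper (see jobspy/scrapers/utils.py later in this diff) already wires in proxies, an optional CA bundle, and retry behavior, which is what lets the hand-rolled User-Agent header and the raw requests.get(..., timeout=10) call be dropped here. A rough sketch of the plain-requests setup such a helper stands in for; the function name and retry values below are illustrative assumptions, not the library's actual internals:

    import requests
    from requests.adapters import HTTPAdapter, Retry

    def make_retrying_session(proxies=None, ca_cert=None):
        # Illustrative stand-in for create_session(..., has_retry=True):
        # a requests.Session with urllib3 Retry mounted on both schemes.
        session = requests.Session()
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        if proxies:
            session.proxies.update(proxies)
        if ca_cert:
            session.verify = ca_cert  # path to a CA bundle
        return session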

View File

@@ -30,6 +30,7 @@ class GoogleJobsException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Google Jobs")
 
+
 class BaytException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Bayt")

View File

@@ -32,7 +32,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("Glassdoor")
+log = create_logger("Glassdoor")
 
 
 class GlassdoorScraper(Scraper):
@@ -64,7 +64,7 @@ class GlassdoorScraper(Scraper):
         self.base_url = self.scraper_input.country.get_glassdoor_url()
         self.session = create_session(
-            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
+            proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
         )
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
 
@@ -74,7 +74,7 @@ class GlassdoorScraper(Scraper):
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
-            logger.error("Glassdoor: location not parsed")
+            log.error("Glassdoor: location not parsed")
             return JobResponse(jobs=[])
         job_list: list[JobPost] = []
         cursor = None
@@ -83,7 +83,7 @@ class GlassdoorScraper(Scraper):
         tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
         range_end = min(tot_pages, self.max_pages + 1)
         for page in range(range_start, range_end):
-            logger.info(f"search page: {page} / {range_end-1}")
+            log.info(f"search page: {page} / {range_end - 1}")
             try:
                 jobs, cursor = self._fetch_jobs_page(
                     scraper_input, location_id, location_type, page, cursor
@@ -93,7 +93,7 @@ class GlassdoorScraper(Scraper):
                 job_list = job_list[: scraper_input.results_wanted]
                 break
             except Exception as e:
-                logger.error(f"Glassdoor: {str(e)}")
+                log.error(f"Glassdoor: {str(e)}")
                 break
 
         return JobResponse(jobs=job_list)
@@ -129,7 +129,7 @@ class GlassdoorScraper(Scraper):
             ValueError,
             Exception,
         ) as e:
-            logger.error(f"Glassdoor: {str(e)}")
+            log.error(f"Glassdoor: {str(e)}")
             return jobs, None
 
         jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -264,12 +264,12 @@ class GlassdoorScraper(Scraper):
         if res.status_code != 200:
             if res.status_code == 429:
                 err = f"429 Response - Blocked by Glassdoor for too many requests"
-                logger.error(err)
+                log.error(err)
                 return None, None
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-                logger.error(f"Glassdoor response status code {res.status_code}")
+                log.error(f"Glassdoor response status code {res.status_code}")
                 return None, None
 
         items = res.json()

View File

@@ -26,7 +26,7 @@ from ...jobs import (
     JobType,
 )
 
-logger = create_logger("Google")
+log = create_logger("Google")
 
 
 class GoogleJobsScraper(Scraper):
@@ -61,7 +61,7 @@ class GoogleJobsScraper(Scraper):
         )
         forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.warning(
+            log.warning(
                 "initial cursor not found, try changing your query or there was at most 10 results"
             )
             return JobResponse(jobs=job_list)
@@ -72,16 +72,16 @@ class GoogleJobsScraper(Scraper):
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
             and forward_cursor
         ):
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             try:
                 jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
             except Exception as e:
-                logger.error(f"failed to get jobs on page: {page}, {e}")
+                log.error(f"failed to get jobs on page: {page}, {e}")
                 break
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -230,10 +230,7 @@ class GoogleJobsScraper(Scraper):
 
     @staticmethod
     def _find_job_info_initial_page(html_text: str):
-        pattern = (
-            f'520084652":('
-            + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
-        )
+        pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
 
         results = []
         matches = re.finditer(pattern, html_text)
@@ -245,6 +242,6 @@ class GoogleJobsScraper(Scraper):
                 results.append(parsed_data)
             except json.JSONDecodeError as e:
-                logger.error(f"Failed to parse match: {str(e)}")
+                log.error(f"Failed to parse match: {str(e)}")
                 results.append({"raw_match": match.group(0), "error": str(e)})
 
         return results

View File

@@ -30,7 +30,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("Indeed")
+log = create_logger("Indeed")
 
 
 class IndeedScraper(Scraper):
@@ -71,12 +71,12 @@ class IndeedScraper(Scraper):
         cursor = None
 
         while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
+            log.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
-                logger.info(f"found no jobs on page: {page}")
+                log.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
@@ -122,9 +122,10 @@ class IndeedScraper(Scraper):
             headers=api_headers_temp,
             json=payload,
             timeout=10,
+            verify=False,
         )
         if not response.ok:
-            logger.info(
+            log.info(
                 f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor
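
Passing verify=False disables TLS certificate verification for this request, and each such call normally emits urllib3's InsecureRequestWarning; that is why the utils module further down now calls urllib3.disable_warnings(...). A small, self-contained illustration of the pairing (the URL is arbitrary):

    import requests
    import urllib3

    # Without this, every verify=False request prints an InsecureRequestWarning.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    resp = requests.get("https://example.com", timeout=10, verify=False)
    print(resp.status_code)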

View File

@@ -38,7 +38,7 @@ from ..utils import (
     markdown_converter,
 )
 
-logger = create_logger("LinkedIn")
+log = create_logger("LinkedIn")
 
 
 class LinkedInScraper(Scraper):
@@ -86,7 +86,7 @@ class LinkedInScraper(Scraper):
         )
         while continue_search():
             request_count += 1
-            logger.info(
+            log.info(
                 f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
             )
             params = {
@@ -126,13 +126,13 @@ class LinkedInScraper(Scraper):
                 else:
                     err = f"LinkedIn response status code {response.status_code}"
                     err += f" - {response.text}"
-                    logger.error(err)
+                    log.error(err)
                     return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
+                    log.error(f"LinkedIn: Bad proxy")
                 else:
-                    logger.error(f"LinkedIn: {str(e)}")
+                    log.error(f"LinkedIn: {str(e)}")
                 return JobResponse(jobs=job_list)
 
             soup = BeautifulSoup(response.text, "html.parser")

View File

@@ -1,17 +1,20 @@
 from __future__ import annotations
 
-import re
 import logging
+import re
 from itertools import cycle
 
+import numpy as np
 import requests
 import tls_client
-import numpy as np
+import urllib3
 from markdownify import markdownify as md
 from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import CompensationInterval, JobType
 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 
 def create_logger(name: str):
     logger = logging.getLogger(f"JobSpy:{name}")
@@ -129,7 +132,7 @@ def create_session(
     return session
 
 
-def set_logger_level(verbose: int = 2):
+def set_logger_level(verbose: int):
     """
     Adjusts the logger's level. This function allows the logging level to be changed at runtime.
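
set_logger_level now takes an explicit verbose argument instead of defaulting to 2, matching the new verbose=0 default in scrape_jobs. The function body is not part of this hunk, so the mapping below is only an assumed sketch of how a verbosity integer typically translates to logging levels for the "JobSpy:"-prefixed loggers that create_logger produces:

    import logging

    def set_logger_level_sketch(verbose: int) -> None:
        # Hypothetical illustration, not jobspy's actual implementation:
        # 0 -> errors only, 1 -> warnings, 2 -> info, anything higher -> debug.
        level = {0: logging.ERROR, 1: logging.WARNING, 2: logging.INFO}.get(verbose, logging.DEBUG)
        for name in logging.Logger.manager.loggerDict:
            if name.startswith("JobSpy:"):
                logging.getLogger(name).setLevel(level)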

View File

@@ -11,11 +11,10 @@ import json
 import math
 import re
 import time
-
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from typing import Optional, Tuple, Any
-from concurrent.futures import ThreadPoolExecutor
 
 from bs4 import BeautifulSoup
 
 from .constants import headers
@@ -37,7 +36,7 @@ from ...jobs import (
     DescriptionFormat,
 )
 
-logger = create_logger("ZipRecruiter")
+log = create_logger("ZipRecruiter")
 
 
 class ZipRecruiterScraper(Scraper):
@@ -77,7 +76,7 @@ class ZipRecruiterScraper(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
-            logger.info(f"search page: {page} / {max_pages}")
+            log.info(f"search page: {page} / {max_pages}")
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )
@@ -110,13 +109,13 @@ class ZipRecruiterScraper(Scraper):
             else:
                 err = f"ZipRecruiter response status code {res.status_code}"
                 err += f" with response: {res.text}"  # ZipRecruiter likely not available in EU
-                logger.error(err)
+                log.error(err)
                 return jobs_list, ""
         except Exception as e:
             if "Proxy responded with" in str(e):
-                logger.error(f"Indeed: Bad proxy")
+                log.error(f"Indeed: Bad proxy")
             else:
-                logger.error(f"Indeed: {str(e)}")
+                log.error(f"Indeed: {str(e)}")
             return jobs_list, ""
 
         res_data = res.json()
@@ -215,7 +214,28 @@ class ZipRecruiterScraper(Scraper):
         return description_full, job_url_direct
 
     def _get_cookies(self):
-        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        """
+        Sends a session event to the API with device properties.
+        """
+        data = [
+            ("event_type", "session"),
+            ("logged_in", "false"),
+            ("number_of_retry", "1"),
+            ("property", "model:iPhone"),
+            ("property", "os:iOS"),
+            ("property", "locale:en_us"),
+            ("property", "app_build_number:4734"),
+            ("property", "app_version:91.0"),
+            ("property", "manufacturer:Apple"),
+            ("property", "timestamp:2025-01-12T12:04:42-06:00"),
+            ("property", "screen_height:852"),
+            ("property", "os_version:16.6.1"),
+            ("property", "source:install"),
+            ("property", "screen_width:393"),
+            ("property", "device_model:iPhone 14 Pro"),
+            ("property", "brand:Apple"),
+        ]
         url = f"{self.api_url}/jobs-app/event"
         self.session.post(url, data=data)
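
Switching data from one long urlencoded string to a list of key/value tuples leans on requests' form encoding: a list of 2-tuples lets the same key ("property") repeat, which a plain dict could not express, and the values (colons, spaces, etc.) are percent-encoded automatically. A quick check of that behavior using the standard library's urlencode, which requests applies to list-of-tuple form bodies:

    from urllib.parse import urlencode

    pairs = [
        ("event_type", "session"),
        ("property", "model:iPhone"),
        ("property", "os:iOS"),
    ]
    # Repeated "property" keys survive and values are percent-encoded.
    print(urlencode(pairs))
    # -> event_type=session&property=model%3AiPhone&property=os%3AiOS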

View File

View File

@@ -1,18 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_all():
-    sites = [
-        "indeed",
-        "glassdoor",
-    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
-    result = scrape_jobs(
-        site_name=sites,
-        search_term="engineer",
-        results_wanted=5,
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
-    ), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_glassdoor():
-    result = scrape_jobs(
-        site_name="glassdoor",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_google():
-    result = scrape_jobs(
-        site_name="google", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_indeed():
-    result = scrape_jobs(
-        site_name="indeed",
-        search_term="engineer",
-        results_wanted=5,
-    )
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

View File

@@ -1,9 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_linkedin():
-    result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_ziprecruiter():
-    result = scrape_jobs(
-        site_name="zip_recruiter", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"