enh: remove log by default

main
Cullen Watson 2025-02-21 12:29:28 -06:00
parent 11a9e9a56a
commit 81ed9b3ddf
17 changed files with 90 additions and 159 deletions

View File

@@ -42,7 +42,7 @@ def scrape_jobs(
offset: int | None = 0,
hours_old: int = None,
enforce_annual_salary: bool = False,
verbose: int = 2,
verbose: int = 0,
**kwargs,
) -> pd.DataFrame:
"""

View File

@@ -1,19 +1,22 @@
"""
jobspy.scrapers.bayt
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Bayt.
"""
from __future__ import annotations
import time
import random
from typing import Optional
import time
import requests
from bs4 import BeautifulSoup
from .. import Scraper, ScraperInput, Site
from ..exceptions import BaytException
from ..utils import create_logger, create_session
from ...jobs import JobPost, JobResponse, Location, Country
from ..utils import create_logger
logger = create_logger("Bayt")
logger.setLevel("DEBUG") # Ensure DEBUG messages are output
log = create_logger("Bayt")
class BaytScraper(Scraper):
@@ -26,10 +29,14 @@ class BaytScraper(Scraper):
):
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
self.scraper_input = None
self.session = None
self.country = "worldwide"
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.scraper_input = scraper_input
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
)
job_list: list[JobPost] = []
page = 1
results_wanted = (
@@ -37,13 +44,15 @@ class BaytScraper(Scraper):
)
while len(job_list) < results_wanted:
logger.info(f"Fetching Bayt jobs page {page}")
log.info(f"Fetching Bayt jobs page {page}")
job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
if not job_elements:
break
if job_elements:
logger.debug("First job element snippet:\n" + job_elements[0].prettify()[:500])
log.debug(
"First job element snippet:\n" + job_elements[0].prettify()[:500]
)
initial_count = len(job_list)
for job in job_elements:
@@ -54,16 +63,16 @@ class BaytScraper(Scraper):
if len(job_list) >= results_wanted:
break
else:
logger.debug(
log.debug(
"Extraction returned None. Job snippet:\n"
+ job.prettify()[:500]
)
except Exception as e:
logger.error(f"Bayt: Error extracting job info: {str(e)}")
log.error(f"Bayt: Error extracting job info: {str(e)}")
continue
if len(job_list) == initial_count:
logger.info(f"No new jobs found on page {page}. Ending pagination.")
log.info(f"No new jobs found on page {page}. Ending pagination.")
break
page += 1
@@ -72,45 +81,35 @@ class BaytScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list]:
def _fetch_jobs(self, query: str, page: int) -> list | None:
"""
Grabs the job results for the given query and page number.
"""
try:
# Updated URL to include the "international" segment as per the original code.
url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
logger.info(f"Constructed URL: {url}")
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/115.0.0.0 Safari/537.36"
)
}
response = requests.get(url, headers=headers, timeout=10)
response = self.session.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# Use the attribute selector as in the original code.
job_listings = soup.find_all("li", attrs={"data-js-job": ""})
logger.info(f"Found {len(job_listings)} job listing elements")
log.debug(f"Found {len(job_listings)} job listing elements")
return job_listings
except Exception as e:
logger.error(f"Bayt: Error fetching jobs - {str(e)}")
log.error(f"Bayt: Error fetching jobs - {str(e)}")
return None
def _extract_job_info(self, job: BeautifulSoup) -> Optional[JobPost]:
def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
"""
Extracts the job information from a single job listing.
"""
# Find the h2 element holding the title and link (no class filtering)
job_general_information = job.find("h2")
if not job_general_information:
return None
return
job_title = job_general_information.get_text(strip=True)
job_url = self._extract_job_url(job_general_information)
if not job_url:
return None
return
# Extract company name using the original approach:
company_tag = job.find("div", class_="t-nowrap p10l")
@@ -129,31 +128,18 @@ class BaytScraper(Scraper):
city=location,
country=Country.from_string(self.country),
)
return JobPost(
id=job_id,
title=job_title,
company_name=company_name,
company_url="",
location=location_obj,
date_posted=None,
job_url=job_url,
compensation=None,
job_type=None,
job_level=None,
company_industry=None,
description=None,
job_url_direct=None,
emails=[],
company_logo=None,
job_function=None,
)
def _extract_job_url(self, job_general_information: BeautifulSoup) -> Optional[str]:
def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
"""
Pulls the job URL from the 'a' within the h2 element.
"""
a_tag = job_general_information.find("a")
if a_tag and a_tag.has_attr("href"):
return self.base_url + a_tag["href"].strip()
return None
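(Two side notes on the Bayt changes above, as hedged sketches rather than the library's exact code. First, the has_retry=True session that replaces the one-off requests.get is roughly equivalent to a plain requests session mounted with a retry adapter; the retry counts below are illustrative, and jobspy's create_session also layers in proxy and TLS handling:)

import requests
from requests.adapters import HTTPAdapter, Retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))  # retries transient failures

(Second, dropping the explicitly-None JobPost arguments works because JobPost appears to be a pydantic-style model, whose optional fields default to None when omitted. A standalone illustration:)

from typing import Optional
from pydantic import BaseModel

class Example(BaseModel):
    id: str
    description: Optional[str] = None  # may be omitted at call sites

Example(id="abc")  # valid; description is implicitly None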

View File

@@ -30,6 +30,7 @@ class GoogleJobsException(Exception):
def __init__(self, message=None):
super().__init__(message or "An error occurred with Google Jobs")
class BaytException(Exception):
def __init__(self, message=None):
super().__init__(message or "An error occurred with Bayt")
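(Side note: the new BaytException mirrors the other per-site exceptions. Conventional usage would look like the sketch below, with the import path inferred from the Bayt scraper's "from ..exceptions import BaytException" above:)

from jobspy.scrapers.exceptions import BaytException

raise BaytException()  # message is optional; "An error occurred with Bayt" is supplied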

View File

@@ -32,7 +32,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("Glassdoor")
log = create_logger("Glassdoor")
class GlassdoorScraper(Scraper):
@@ -64,7 +64,7 @@ class GlassdoorScraper(Scraper):
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
)
token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token
@@ -74,7 +74,7 @@ class GlassdoorScraper(Scraper):
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
logger.error("Glassdoor: location not parsed")
log.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
job_list: list[JobPost] = []
cursor = None
@@ -83,7 +83,7 @@ class GlassdoorScraper(Scraper):
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"search page: {page} / {range_end-1}")
log.info(f"search page: {page} / {range_end - 1}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
@@ -93,7 +93,7 @@ class GlassdoorScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=job_list)
@@ -129,7 +129,7 @@ class GlassdoorScraper(Scraper):
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -264,12 +264,12 @@ class GlassdoorScraper(Scraper):
if res.status_code != 200:
if res.status_code == 429:
err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
log.error(err)
return None, None
else:
err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
log.error(f"Glassdoor response status code {res.status_code}")
return None, None
items = res.json()

View File

@@ -26,7 +26,7 @@ from ...jobs import (
JobType,
)
logger = create_logger("Google")
log = create_logger("Google")
class GoogleJobsScraper(Scraper):
@@ -61,7 +61,7 @@ class GoogleJobsScraper(Scraper):
)
forward_cursor, job_list = self._get_initial_cursor_and_jobs()
if forward_cursor is None:
logger.warning(
log.warning(
"initial cursor not found, try changing your query or there was at most 10 results"
)
return JobResponse(jobs=job_list)
@@ -72,16 +72,16 @@ class GoogleJobsScraper(Scraper):
len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
and forward_cursor
):
logger.info(
log.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
try:
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
except Exception as e:
logger.error(f"failed to get jobs on page: {page}, {e}")
log.error(f"failed to get jobs on page: {page}, {e}")
break
if not jobs:
logger.info(f"found no jobs on page: {page}")
log.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
@@ -230,10 +230,7 @@ class GoogleJobsScraper(Scraper):
@staticmethod
def _find_job_info_initial_page(html_text: str):
pattern = (
f'520084652":('
+ r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
)
pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
results = []
matches = re.finditer(pattern, html_text)
@@ -245,6 +242,6 @@ class GoogleJobsScraper(Scraper):
results.append(parsed_data)
except json.JSONDecodeError as e:
logger.error(f"Failed to parse match: {str(e)}")
log.error(f"Failed to parse match: {str(e)}")
results.append({"raw_match": match.group(0), "error": str(e)})
return results

View File

@@ -30,7 +30,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("Indeed")
log = create_logger("Indeed")
class IndeedScraper(Scraper):
@@ -71,12 +71,12 @@ class IndeedScraper(Scraper):
cursor = None
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
logger.info(
log.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, cursor = self._scrape_page(cursor)
if not jobs:
logger.info(f"found no jobs on page: {page}")
log.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
@@ -122,9 +122,10 @@ class IndeedScraper(Scraper):
headers=api_headers_temp,
json=payload,
timeout=10,
verify=False,
)
if not response.ok:
logger.info(
log.info(
f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
)
return jobs, new_cursor
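(Side note on the verify=False added above: it disables TLS certificate verification for the Indeed API call, and each such request normally triggers urllib3's InsecureRequestWarning. The companion change in utils.py below suppresses that warning globally; the same call works standalone:)

import urllib3

# Silences the InsecureRequestWarning emitted for unverified HTTPS requests;
# this commit adds the equivalent line at import time in jobspy's utils module.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)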

View File

@@ -38,7 +38,7 @@ from ..utils import (
markdown_converter,
)
logger = create_logger("LinkedIn")
log = create_logger("LinkedIn")
class LinkedInScraper(Scraper):
@@ -86,7 +86,7 @@ class LinkedInScraper(Scraper):
)
while continue_search():
request_count += 1
logger.info(
log.info(
f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = {
@@ -126,13 +126,13 @@ class LinkedInScraper(Scraper):
else:
err = f"LinkedIn response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
log.error(err)
return JobResponse(jobs=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"LinkedIn: Bad proxy")
log.error(f"LinkedIn: Bad proxy")
else:
logger.error(f"LinkedIn: {str(e)}")
log.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")

View File

@@ -1,17 +1,20 @@
from __future__ import annotations
import re
import logging
import re
from itertools import cycle
import numpy as np
import requests
import tls_client
import numpy as np
import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def create_logger(name: str):
logger = logging.getLogger(f"JobSpy:{name}")
@@ -129,7 +132,7 @@ def create_session(
return session
def set_logger_level(verbose: int = 2):
def set_logger_level(verbose: int):
"""
Adjusts the logger's level. This function allows the logging level to be changed at runtime.
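(Side note: with the default argument removed, callers of set_logger_level must now pass the verbosity explicitly. A minimal usage sketch; the module path is inferred from this diff's layout:)

from jobspy.scrapers.utils import set_logger_level

set_logger_level(2)  # re-enable full logging at runtime; no implicit default anymore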

View File

@@ -11,11 +11,10 @@ import json
import math
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Optional, Tuple, Any
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from .constants import headers
@@ -37,7 +36,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("ZipRecruiter")
log = create_logger("ZipRecruiter")
class ZipRecruiterScraper(Scraper):
@@ -77,7 +76,7 @@ class ZipRecruiterScraper(Scraper):
break
if page > 1:
time.sleep(self.delay)
logger.info(f"search page: {page} / {max_pages}")
log.info(f"search page: {page} / {max_pages}")
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
@@ -110,13 +109,13 @@ class ZipRecruiterScraper(Scraper):
else:
err = f"ZipRecruiter response status code {res.status_code}"
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
logger.error(err)
log.error(err)
return jobs_list, ""
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Indeed: Bad proxy")
log.error(f"Indeed: Bad proxy")
else:
logger.error(f"Indeed: {str(e)}")
log.error(f"Indeed: {str(e)}")
return jobs_list, ""
res_data = res.json()
@@ -215,7 +214,28 @@ class ZipRecruiterScraper(Scraper):
return description_full, job_url_direct
def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
"""
Sends a session event to the API with device properties.
"""
data = [
("event_type", "session"),
("logged_in", "false"),
("number_of_retry", "1"),
("property", "model:iPhone"),
("property", "os:iOS"),
("property", "locale:en_us"),
("property", "app_build_number:4734"),
("property", "app_version:91.0"),
("property", "manufacturer:Apple"),
("property", "timestamp:2025-01-12T12:04:42-06:00"),
("property", "screen_height:852"),
("property", "os_version:16.6.1"),
("property", "source:install"),
("property", "screen_width:393"),
("property", "device_model:iPhone 14 Pro"),
("property", "brand:Apple"),
]
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data)
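(Side note: the rewritten _get_cookies payload carries the same data as the old urlencoded string, but expressed as a list of 2-tuples, which is how requests encodes a form field such as "property" that repeats. The encoding can be checked standalone:)

from urllib.parse import urlencode

pairs = [("property", "model:iPhone"), ("property", "os:iOS")]
print(urlencode(pairs))  # property=model%3AiPhone&property=os%3AiOS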

View File

View File

@@ -1,18 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_all():
sites = [
"indeed",
"glassdoor",
] # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
result = scrape_jobs(
site_name=sites,
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_glassdoor():
result = scrape_jobs(
site_name="glassdoor",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_google():
result = scrape_jobs(
site_name="google", search_term="software engineer", results_wanted=5
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_indeed():
result = scrape_jobs(
site_name="indeed",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,9 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_linkedin():
result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_ziprecruiter():
result = scrape_jobs(
site_name="zip_recruiter", search_term="software engineer", results_wanted=5
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"