format: jobspy/scrapers/utils

pull/127/head
VitaminB16 2024-03-09 19:06:25 +00:00
parent 7dcca432a2
commit 0654232db4
1 changed files with 24 additions and 19 deletions

View File

@ -1,11 +1,10 @@
from __future__ import annotations
import logging
import re
import numpy as np
import logging
import requests
import tls_client
import numpy as np
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
@ -16,7 +15,8 @@ logger.propagate = False
if not logger.handlers:
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
@ -35,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
def create_session(
proxy: dict | None = None,
is_tls: bool = True,
has_retry: bool = False,
delay: int = 1,
) -> requests.Session:
"""
Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object
@ -49,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
if proxy:
session.proxies.update(proxy)
if has_retry:
retries = Retry(total=3,
retries = Retry(
total=3,
connect=3,
status=3,
status_forcelist=[500, 502, 503, 504, 429],
backoff_factor=delay)
backoff_factor=delay,
)
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)
session.mount("http://", adapter)
session.mount("https://", adapter)
return session
@ -75,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
def currency_parser(cur_str):
# Remove any non-numerical characters
# except for ',' '.' or '-' (e.g. EUR)
cur_str = re.sub("[^-0-9.,]", '', cur_str)
cur_str = re.sub("[^-0-9.,]", "", cur_str)
# Remove any 000s separators (either , or .)
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
if '.' in list(cur_str[-3:]):
if "." in list(cur_str[-3:]):
num = float(cur_str)
elif ',' in list(cur_str[-3:]):
num = float(cur_str.replace(',', '.'))
elif "," in list(cur_str[-3:]):
num = float(cur_str.replace(",", "."))
else:
num = float(cur_str)
return np.round(num, 2)