mirror of https://github.com/Bunsly/JobSpy
format: jobspy/scrapers/utils
parent
7dcca432a2
commit
0654232db4
|
@ -1,11 +1,10 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import logging
|
||||
import requests
|
||||
import tls_client
|
||||
import numpy as np
|
||||
from markdownify import markdownify as md
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
|
||||
|
@ -16,7 +15,8 @@ logger.propagate = False
|
|||
if not logger.handlers:
|
||||
logger.setLevel(logging.INFO)
|
||||
console_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
formatter = logging.Formatter(format)
|
||||
console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
|
@ -35,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
|
|||
return email_regex.findall(text)
|
||||
|
||||
|
||||
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
|
||||
def create_session(
|
||||
proxy: dict | None = None,
|
||||
is_tls: bool = True,
|
||||
has_retry: bool = False,
|
||||
delay: int = 1,
|
||||
) -> requests.Session:
|
||||
"""
|
||||
Creates a requests session with optional tls, proxy, and retry settings.
|
||||
:return: A session object
|
||||
|
@ -49,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
|
|||
if proxy:
|
||||
session.proxies.update(proxy)
|
||||
if has_retry:
|
||||
retries = Retry(total=3,
|
||||
retries = Retry(
|
||||
total=3,
|
||||
connect=3,
|
||||
status=3,
|
||||
status_forcelist=[500, 502, 503, 504, 429],
|
||||
backoff_factor=delay)
|
||||
backoff_factor=delay,
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retries)
|
||||
|
||||
session.mount('http://', adapter)
|
||||
session.mount('https://', adapter)
|
||||
session.mount("http://", adapter)
|
||||
session.mount("https://", adapter)
|
||||
return session
|
||||
|
||||
|
||||
|
@ -75,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
|||
def currency_parser(cur_str):
|
||||
# Remove any non-numerical characters
|
||||
# except for ',' '.' or '-' (e.g. EUR)
|
||||
cur_str = re.sub("[^-0-9.,]", '', cur_str)
|
||||
cur_str = re.sub("[^-0-9.,]", "", cur_str)
|
||||
# Remove any 000s separators (either , or .)
|
||||
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
|
||||
cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
|
||||
|
||||
if '.' in list(cur_str[-3:]):
|
||||
if "." in list(cur_str[-3:]):
|
||||
num = float(cur_str)
|
||||
elif ',' in list(cur_str[-3:]):
|
||||
num = float(cur_str.replace(',', '.'))
|
||||
elif "," in list(cur_str[-3:]):
|
||||
num = float(cur_str.replace(",", "."))
|
||||
else:
|
||||
num = float(cur_str)
|
||||
|
||||
return np.round(num, 2)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue