mirror of https://github.com/Bunsly/JobSpy
format: jobspy/scrapers/utils
parent
7dcca432a2
commit
0654232db4
|
@ -1,11 +1,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
import numpy as np
|
|
||||||
import requests
|
import requests
|
||||||
import tls_client
|
import tls_client
|
||||||
|
import numpy as np
|
||||||
from markdownify import markdownify as md
|
from markdownify import markdownify as md
|
||||||
from requests.adapters import HTTPAdapter, Retry
|
from requests.adapters import HTTPAdapter, Retry
|
||||||
|
|
||||||
|
@ -16,7 +15,8 @@ logger.propagate = False
|
||||||
if not logger.handlers:
|
if not logger.handlers:
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
console_handler = logging.StreamHandler()
|
console_handler = logging.StreamHandler()
|
||||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
formatter = logging.Formatter(format)
|
||||||
console_handler.setFormatter(formatter)
|
console_handler.setFormatter(formatter)
|
||||||
logger.addHandler(console_handler)
|
logger.addHandler(console_handler)
|
||||||
|
|
||||||
|
@ -35,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
|
||||||
return email_regex.findall(text)
|
return email_regex.findall(text)
|
||||||
|
|
||||||
|
|
||||||
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
|
def create_session(
|
||||||
|
proxy: dict | None = None,
|
||||||
|
is_tls: bool = True,
|
||||||
|
has_retry: bool = False,
|
||||||
|
delay: int = 1,
|
||||||
|
) -> requests.Session:
|
||||||
"""
|
"""
|
||||||
Creates a requests session with optional tls, proxy, and retry settings.
|
Creates a requests session with optional tls, proxy, and retry settings.
|
||||||
:return: A session object
|
:return: A session object
|
||||||
|
@ -49,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
|
||||||
if proxy:
|
if proxy:
|
||||||
session.proxies.update(proxy)
|
session.proxies.update(proxy)
|
||||||
if has_retry:
|
if has_retry:
|
||||||
retries = Retry(total=3,
|
retries = Retry(
|
||||||
|
total=3,
|
||||||
connect=3,
|
connect=3,
|
||||||
status=3,
|
status=3,
|
||||||
status_forcelist=[500, 502, 503, 504, 429],
|
status_forcelist=[500, 502, 503, 504, 429],
|
||||||
backoff_factor=delay)
|
backoff_factor=delay,
|
||||||
|
)
|
||||||
adapter = HTTPAdapter(max_retries=retries)
|
adapter = HTTPAdapter(max_retries=retries)
|
||||||
|
|
||||||
session.mount('http://', adapter)
|
session.mount("http://", adapter)
|
||||||
session.mount('https://', adapter)
|
session.mount("https://", adapter)
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
||||||
def currency_parser(cur_str):
|
def currency_parser(cur_str):
|
||||||
# Remove any non-numerical characters
|
# Remove any non-numerical characters
|
||||||
# except for ',' '.' or '-' (e.g. EUR)
|
# except for ',' '.' or '-' (e.g. EUR)
|
||||||
cur_str = re.sub("[^-0-9.,]", '', cur_str)
|
cur_str = re.sub("[^-0-9.,]", "", cur_str)
|
||||||
# Remove any 000s separators (either , or .)
|
# Remove any 000s separators (either , or .)
|
||||||
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
|
cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
|
||||||
|
|
||||||
if '.' in list(cur_str[-3:]):
|
if "." in list(cur_str[-3:]):
|
||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
elif ',' in list(cur_str[-3:]):
|
elif "," in list(cur_str[-3:]):
|
||||||
num = float(cur_str.replace(',', '.'))
|
num = float(cur_str.replace(",", "."))
|
||||||
else:
|
else:
|
||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
|
|
||||||
return np.round(num, 2)
|
return np.round(num, 2)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue