mirror of https://github.com/Bunsly/JobSpy
[chore] format
parent
bb21f47537
commit
28dba8b16e
|
@ -37,15 +37,16 @@ class JobType(Enum):
|
||||||
"повназайнятість",
|
"повназайнятість",
|
||||||
"toànthờigian",
|
"toànthờigian",
|
||||||
)
|
)
|
||||||
PART_TIME = (
|
PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
|
||||||
"parttime",
|
|
||||||
"teilzeit",
|
|
||||||
"částečnýúvazek",
|
|
||||||
"deltid"
|
|
||||||
)
|
|
||||||
CONTRACT = ("contract", "contractor")
|
CONTRACT = ("contract", "contractor")
|
||||||
TEMPORARY = ("temporary",)
|
TEMPORARY = ("temporary",)
|
||||||
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik")
|
INTERNSHIP = (
|
||||||
|
"internship",
|
||||||
|
"prácticas",
|
||||||
|
"ojt(onthejobtraining)",
|
||||||
|
"praktikum",
|
||||||
|
"praktik",
|
||||||
|
)
|
||||||
|
|
||||||
PER_DIEM = ("perdiem",)
|
PER_DIEM = ("perdiem",)
|
||||||
NIGHTS = ("nights",)
|
NIGHTS = ("nights",)
|
||||||
|
|
|
@ -16,7 +16,12 @@ from bs4.element import Tag
|
||||||
from concurrent.futures import ThreadPoolExecutor, Future
|
from concurrent.futures import ThreadPoolExecutor, Future
|
||||||
|
|
||||||
from ..exceptions import IndeedException
|
from ..exceptions import IndeedException
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session, get_enum_from_job_type
|
from ..utils import (
|
||||||
|
count_urgent_words,
|
||||||
|
extract_emails_from_text,
|
||||||
|
create_session,
|
||||||
|
get_enum_from_job_type,
|
||||||
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Compensation,
|
Compensation,
|
||||||
|
@ -165,8 +170,7 @@ class IndeedScraper(Scraper):
|
||||||
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
job_results: list[Future] = [
|
job_results: list[Future] = [
|
||||||
executor.submit(process_job, job)
|
executor.submit(process_job, job) for job in jobs
|
||||||
for job in jobs
|
|
||||||
]
|
]
|
||||||
|
|
||||||
job_list = [result.result() for result in job_results if result.result()]
|
job_list = [result.result() for result in job_results if result.result()]
|
||||||
|
@ -231,14 +235,16 @@ class IndeedScraper(Scraper):
|
||||||
if response.status_code not in range(200, 400):
|
if response.status_code not in range(200, 400):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
script_tag = soup.find('script', text=lambda x: x and 'window._initialData' in x)
|
script_tag = soup.find(
|
||||||
|
"script", text=lambda x: x and "window._initialData" in x
|
||||||
|
)
|
||||||
|
|
||||||
if not script_tag:
|
if not script_tag:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
script_code = script_tag.string
|
script_code = script_tag.string
|
||||||
match = re.search(r'window\._initialData\s*=\s*({.*?})\s*;', script_code, re.S)
|
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
|
||||||
|
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
|
@ -246,12 +252,18 @@ class IndeedScraper(Scraper):
|
||||||
json_string = match.group(1)
|
json_string = match.group(1)
|
||||||
data = json.loads(json_string)
|
data = json.loads(json_string)
|
||||||
try:
|
try:
|
||||||
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
|
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
|
||||||
|
"sanitizedJobDescription"
|
||||||
|
]
|
||||||
except (KeyError, TypeError, IndexError):
|
except (KeyError, TypeError, IndexError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
soup = BeautifulSoup(job_description, "html.parser") # No need for StringIO, pass the string directly
|
soup = BeautifulSoup(
|
||||||
text_content = " ".join(soup.get_text(separator=" ").split()).strip() # Clean and normalize whitespaces
|
job_description, "html.parser"
|
||||||
|
)
|
||||||
|
text_content = " ".join(
|
||||||
|
soup.get_text(separator=" ").split()
|
||||||
|
).strip()
|
||||||
|
|
||||||
return text_content
|
return text_content
|
||||||
|
|
||||||
|
@ -297,6 +309,7 @@ class IndeedScraper(Scraper):
|
||||||
):
|
):
|
||||||
return tag
|
return tag
|
||||||
return None
|
return None
|
||||||
|
|
||||||
script_tag = find_mosaic_script()
|
script_tag = find_mosaic_script()
|
||||||
|
|
||||||
if script_tag:
|
if script_tag:
|
||||||
|
|
|
@ -44,6 +44,7 @@ def create_session(proxy: str | None = None):
|
||||||
|
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
||||||
"""
|
"""
|
||||||
Given a string, returns the corresponding JobType enum member if a match is found.
|
Given a string, returns the corresponding JobType enum member if a match is found.
|
||||||
|
|
Loading…
Reference in New Issue