[chore] format

pull/58/head
Cullen Watson 2023-10-18 14:22:59 -05:00
parent bb21f47537
commit 28dba8b16e
3 changed files with 31 additions and 16 deletions

View File

@ -37,15 +37,16 @@ class JobType(Enum):
"повназайнятість", "повназайнятість",
"toànthờigian", "toànthờigian",
) )
PART_TIME = ( PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
"parttime",
"teilzeit",
"částečnýúvazek",
"deltid"
)
CONTRACT = ("contract", "contractor") CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",) TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik") INTERNSHIP = (
"internship",
"prácticas",
"ojt(onthejobtraining)",
"praktikum",
"praktik",
)
PER_DIEM = ("perdiem",) PER_DIEM = ("perdiem",)
NIGHTS = ("nights",) NIGHTS = ("nights",)

View File

@ -16,7 +16,12 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session, get_enum_from_job_type from ..utils import (
count_urgent_words,
extract_emails_from_text,
create_session,
get_enum_from_job_type,
)
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -165,8 +170,7 @@ class IndeedScraper(Scraper):
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor: with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(process_job, job) executor.submit(process_job, job) for job in jobs
for job in jobs
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
@ -231,14 +235,16 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None return None
soup = BeautifulSoup(response.text, 'html.parser') soup = BeautifulSoup(response.text, "html.parser")
script_tag = soup.find('script', text=lambda x: x and 'window._initialData' in x) script_tag = soup.find(
"script", text=lambda x: x and "window._initialData" in x
)
if not script_tag: if not script_tag:
return None return None
script_code = script_tag.string script_code = script_tag.string
match = re.search(r'window\._initialData\s*=\s*({.*?})\s*;', script_code, re.S) match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
if not match: if not match:
return None return None
@ -246,12 +252,18 @@ class IndeedScraper(Scraper):
json_string = match.group(1) json_string = match.group(1)
data = json.loads(json_string) data = json.loads(json_string)
try: try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"] job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError): except (KeyError, TypeError, IndexError):
return None return None
soup = BeautifulSoup(job_description, "html.parser") # No need for StringIO, pass the string directly soup = BeautifulSoup(
text_content = " ".join(soup.get_text(separator=" ").split()).strip() # Clean and normalize whitespaces job_description, "html.parser"
)
text_content = " ".join(
soup.get_text(separator=" ").split()
).strip()
return text_content return text_content
@ -297,6 +309,7 @@ class IndeedScraper(Scraper):
): ):
return tag return tag
return None return None
script_tag = find_mosaic_script() script_tag = find_mosaic_script()
if script_tag: if script_tag:

View File

@ -44,6 +44,7 @@ def create_session(proxy: str | None = None):
return session return session
def get_enum_from_job_type(job_type_str: str) -> JobType | None: def get_enum_from_job_type(job_type_str: str) -> JobType | None:
""" """
Given a string, returns the corresponding JobType enum member if a match is found. Given a string, returns the corresponding JobType enum member if a match is found.