From 28dba8b16e40f2634096ec1df96cadd347c3d6b9 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Wed, 18 Oct 2023 14:22:59 -0500
Subject: [PATCH] [chore] format

---
 src/jobspy/jobs/__init__.py            | 15 +++++++------
 src/jobspy/scrapers/indeed/__init__.py | 31 ++++++++++++++++++--------
 src/jobspy/scrapers/utils.py           |  1 +
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index e44e885..eec40a1 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -37,15 +37,16 @@ class JobType(Enum):
         "повназайнятість",
         "toànthờigian",
     )
-    PART_TIME = (
-        "parttime",
-        "teilzeit",
-        "částečnýúvazek",
-        "deltid"
-    )
+    PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
     CONTRACT = ("contract", "contractor")
     TEMPORARY = ("temporary",)
-    INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik")
+    INTERNSHIP = (
+        "internship",
+        "prácticas",
+        "ojt(onthejobtraining)",
+        "praktikum",
+        "praktik",
+    )
 
     PER_DIEM = ("perdiem",)
     NIGHTS = ("nights",)
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 0397ab3..3800221 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -16,7 +16,12 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
 
 from ..exceptions import IndeedException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session, get_enum_from_job_type
+from ..utils import (
+    count_urgent_words,
+    extract_emails_from_text,
+    create_session,
+    get_enum_from_job_type,
+)
 from ...jobs import (
     JobPost,
     Compensation,
@@ -165,8 +170,7 @@ class IndeedScraper(Scraper):
         jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
         with ThreadPoolExecutor(max_workers=1) as executor:
             job_results: list[Future] = [
-                executor.submit(process_job, job)
-                for job in jobs
+                executor.submit(process_job, job) for job in jobs
             ]
 
         job_list = [result.result() for result in job_results if result.result()]
@@ -231,14 +235,16 @@ class IndeedScraper(Scraper):
         if response.status_code not in range(200, 400):
             return None
 
-        soup = BeautifulSoup(response.text, 'html.parser')
-        script_tag = soup.find('script', text=lambda x: x and 'window._initialData' in x)
+        soup = BeautifulSoup(response.text, "html.parser")
+        script_tag = soup.find(
+            "script", text=lambda x: x and "window._initialData" in x
+        )
 
         if not script_tag:
             return None
 
         script_code = script_tag.string
-        match = re.search(r'window\._initialData\s*=\s*({.*?})\s*;', script_code, re.S)
+        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
 
         if not match:
             return None
@@ -246,12 +252,18 @@
         json_string = match.group(1)
         data = json.loads(json_string)
         try:
-            job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
+            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
+                "sanitizedJobDescription"
+            ]
         except (KeyError, TypeError, IndexError):
             return None
 
-        soup = BeautifulSoup(job_description, "html.parser")  # No need for StringIO, pass the string directly
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()  # Clean and normalize whitespaces
+        soup = BeautifulSoup(
+            job_description, "html.parser"
+        )
+        text_content = " ".join(
+            soup.get_text(separator=" ").split()
+        ).strip()
 
         return text_content
 
@@ -297,6 +309,7 @@ class IndeedScraper(Scraper):
             ):
                 return tag
            return None
+
        script_tag = find_mosaic_script()

        if script_tag:
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 6b6baec..f34a48d 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -44,6 +44,7 @@ def create_session(proxy: str | None = None):
 
     return session
 
+
 def get_enum_from_job_type(job_type_str: str) -> JobType | None:
     """
     Given a string, returns the corresponding JobType enum member if a match is found.