From 184063edb62b9d076ec7b4a0a1a1741ae64bd300 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Sun, 3 Sep 2023 19:18:55 -0500
Subject: [PATCH] black format

---
 src/jobspy/__init__.py                       | 76 +++++++++++---------
 src/jobspy/scrapers/indeed/__init__.py       | 20 ++++--
 src/jobspy/scrapers/linkedin/__init__.py     | 13 +++-
 src/jobspy/scrapers/ziprecruiter/__init__.py | 62 +++++++++-------
 4 files changed, 103 insertions(+), 68 deletions(-)

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 3c2d7f8..6d73c4a 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -24,15 +24,14 @@ def _map_str_to_site(site_name: str) -> Site:
 
 
 def scrape_jobs(
-        site_name: str | Site | List[Site],
-        search_term: str,
-
-        location: str = "",
-        distance: int = None,
-        is_remote: bool = False,
-        job_type: JobType = None,
-        easy_apply: bool = False,  # linkedin
-        results_wanted: int = 15
+    site_name: str | Site | List[Site],
+    search_term: str,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15,
 ) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
@@ -71,48 +70,59 @@ def scrape_jobs(
     for site, job_response in results.items():
         for job in job_response.jobs:
             data = job.dict()
-            data['site'] = site
+            data["site"] = site
 
             # Formatting JobType
-            data['job_type'] = data['job_type'].value if data['job_type'] else None
+            data["job_type"] = data["job_type"].value if data["job_type"] else None
 
             # Formatting Location
-            location_obj = data.get('location')
+            location_obj = data.get("location")
             if location_obj and isinstance(location_obj, dict):
-                data['city'] = location_obj.get('city', '')
-                data['state'] = location_obj.get('state', '')
-                data['country'] = location_obj.get('country', 'USA')
+                data["city"] = location_obj.get("city", "")
+                data["state"] = location_obj.get("state", "")
+                data["country"] = location_obj.get("country", "USA")
             else:
-                data['city'] = None
-                data['state'] = None
-                data['country'] = None
+                data["city"] = None
+                data["state"] = None
+                data["country"] = None
 
             # Formatting Compensation
-            compensation_obj = data.get('compensation')
+            compensation_obj = data.get("compensation")
             if compensation_obj and isinstance(compensation_obj, dict):
-                data['interval'] = compensation_obj.get('interval').value if compensation_obj.get('interval') else None
-                data['min_amount'] = compensation_obj.get('min_amount')
-                data['max_amount'] = compensation_obj.get('max_amount')
-                data['currency'] = compensation_obj.get('currency', 'USD')
+                data["interval"] = (
+                    compensation_obj.get("interval").value
+                    if compensation_obj.get("interval")
+                    else None
+                )
+                data["min_amount"] = compensation_obj.get("min_amount")
+                data["max_amount"] = compensation_obj.get("max_amount")
+                data["currency"] = compensation_obj.get("currency", "USD")
             else:
-                data['interval'] = None
-                data['min_amount'] = None
-                data['max_amount'] = None
-                data['currency'] = None
+                data["interval"] = None
+                data["min_amount"] = None
+                data["max_amount"] = None
+                data["currency"] = None
 
             job_df = pd.DataFrame([data])
             dfs.append(job_df)
 
     if dfs:
         df = pd.concat(dfs, ignore_index=True)
-        desired_order = ['site', 'title', 'company_name', 'city', 'state','job_type',
-                         'interval', 'min_amount', 'max_amount', 'job_url', 'description',]
+        desired_order = [
+            "site",
+            "title",
+            "company_name",
+            "city",
+            "state",
+            "job_type",
+            "interval",
+            "min_amount",
+            "max_amount",
+            "job_url",
+            "description",
+        ]
         df = df[desired_order]
     else:
         df = pd.DataFrame()
 
     return df
-
-
-
-
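The hunks above only reformat scrape_jobs; its behavior is unchanged: each JobPost is flattened into one DataFrame row, with the nested Location and Compensation spread into flat columns (city, state, interval, min_amount, max_amount, and so on) before the desired_order selection. A minimal usage sketch, assuming the src-layout import path this patch itself uses; the site string and search values are made up for illustration:

    from src.jobspy import scrape_jobs

    # Returns a pandas DataFrame with one row per job and the flat
    # columns selected by desired_order above.
    df = scrape_jobs(
        site_name="indeed",
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=10,
    )
    print(df[["site", "title", "company_name", "min_amount", "max_amount"]])
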
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index b67be0a..fba4c8f 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -10,7 +10,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
 
-from src.jobspy.jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from src.jobspy.jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+)
 from .. import Scraper, ScraperInput, Site, StatusException
 
 
@@ -60,10 +67,7 @@ class IndeedScraper(Scraper):
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         response = session.get(self.url + "/jobs", params=params)
 
-        if (
-            response.status_code != 200
-            and response.status_code != 307
-        ):
+        if response.status_code != 200 and response.status_code != 307:
             raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
@@ -135,8 +139,10 @@ class IndeedScraper(Scraper):
             return job_post
 
         with ThreadPoolExecutor(max_workers=10) as executor:
-            job_results: list[Future] = [executor.submit(process_job, job) for job in
-                                         jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]]
+            job_results: list[Future] = [
+                executor.submit(process_job, job)
+                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+            ]
 
         job_list = [result.result() for result in job_results if result.result()]
 
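The Indeed hunk above reformats, without changing, the fan-out shape the scrapers share: submit one task per job card to a ThreadPoolExecutor, then keep only truthy results. A self-contained sketch of that pattern (process_card and the card dicts are stand-ins, not the scraper's real parser):

    from concurrent.futures import ThreadPoolExecutor, Future

    def process_card(card: dict) -> dict | None:
        # Stand-in parser: drop cards without a title, the way the
        # scrapers drop jobs they cannot parse.
        return card if card.get("title") else None

    cards = [{"title": "Data Engineer"}, {"title": ""}, {"title": "SRE"}]
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures: list[Future] = [executor.submit(process_card, c) for c in cards]

    # Same filter as the patch; it calls result() twice per future, which is
    # redundant but harmless since the result is cached once computed.
    job_list = [f.result() for f in futures if f.result()]
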
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index d36ab9d..0855bc3 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -6,7 +6,14 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 
 from .. import Scraper, ScraperInput, Site
-from src.jobspy.jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
+from src.jobspy.jobs import (
+    JobPost,
+    Location,
+    JobResponse,
+    JobType,
+    Compensation,
+    CompensationInterval,
+)
 
 
 class LinkedInScraper(Scraper):
@@ -117,7 +124,9 @@ class LinkedInScraper(Scraper):
                     date_posted=date_posted,
                     job_url=job_url,
                     job_type=job_type,
-                    compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
+                    compensation=Compensation(
+                        interval=CompensationInterval.YEARLY, currency="USD"
+                    ),
                 )
                 job_list.append(job_post)
             if (
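The LinkedIn hunk records a compensation interval and currency but no amounts, presumably because the search results expose no salary figures. A sketch of what that Compensation call produces, using an illustrative stand-in for the real Pydantic models in src/jobspy/jobs.py (the actual models may define more fields and validators):

    from enum import Enum
    from typing import Optional
    from pydantic import BaseModel

    class CompensationInterval(Enum):  # stand-in for src.jobspy.jobs
        YEARLY = "yearly"

    class Compensation(BaseModel):  # stand-in for src.jobspy.jobs
        interval: CompensationInterval
        min_amount: Optional[int] = None
        max_amount: Optional[int] = None
        currency: str = "USD"

    # As in the hunk: only interval and currency are known; amounts stay None.
    comp = Compensation(interval=CompensationInterval.YEARLY, currency="USD")
    assert comp.min_amount is None and comp.max_amount is None
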
{job.get('EmploymentType')}") return None @@ -215,14 +226,14 @@ class ZipRecruiterScraper(Scraper): salary_parts = formatted_salary.split(" ") min_salary_str = salary_parts[0][1:].replace(",", "") - if '.' in min_salary_str: + if "." in min_salary_str: min_amount = int(float(min_salary_str) * 1000) else: min_amount = int(min_salary_str.replace("K", "000")) if len(salary_parts) >= 3 and salary_parts[2].startswith("$"): max_salary_str = salary_parts[2][1:].replace(",", "") - if '.' in max_salary_str: + if "." in max_salary_str: max_amount = int(float(max_salary_str) * 1000) else: max_amount = int(max_salary_str.replace("K", "000")) @@ -232,10 +243,12 @@ class ZipRecruiterScraper(Scraper): compensation = Compensation( interval=CompensationInterval.YEARLY, min_amount=min_amount, - max_amount=max_amount + max_amount=max_amount, ) save_job_url = job.get("SaveJobURL", "") - posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url) + posted_time_match = re.search( + r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url + ) if posted_time_match: date_time_str = posted_time_match.group(1) date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ") @@ -269,10 +282,7 @@ class ZipRecruiterScraper(Scraper): return item raise ValueError(f"Invalid value for JobType: {value}") - def get_description( - self, - job_page_url: str - ) -> Tuple[Optional[str], Optional[str]]: + def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: """ Retrieves job description by going to the job page url :param job_page_url: