From 63790da29ff1b69d47fa88cb74120250774d3d3e Mon Sep 17 00:00:00 2001 From: VitaminB16 Date: Sat, 9 Mar 2024 19:48:37 +0000 Subject: [PATCH] format: jobspy/scrapers/indeed --- src/jobspy/scrapers/indeed/__init__.py | 184 +++++++++++++++---------- 1 file changed, 114 insertions(+), 70 deletions(-) diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index dafd193..b745186 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -4,12 +4,13 @@ jobspy.scrapers.indeed This module contains routines to scrape Indeed. """ + from __future__ import annotations -from typing import Tuple import math -from concurrent.futures import ThreadPoolExecutor, Future +from typing import Tuple from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, Future import requests @@ -18,7 +19,7 @@ from ..utils import ( extract_emails_from_text, get_enum_from_job_type, markdown_converter, - logger + logger, ) from ...jobs import ( JobPost, @@ -27,7 +28,7 @@ from ...jobs import ( Location, JobResponse, JobType, - DescriptionFormat + DescriptionFormat, ) @@ -57,28 +58,28 @@ class IndeedScraper(Scraper): domain, self.api_country_code = self.scraper_input.country.indeed_domain_value self.base_url = f"https://{domain}.indeed.com" self.headers = self.api_headers.copy() - self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value + self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value job_list = [] page = 1 cursor = None offset_pages = math.ceil(self.scraper_input.offset / 100) for _ in range(offset_pages): - logger.info(f'Indeed skipping search page: {page}') + logger.info(f"Indeed skipping search page: {page}") __, cursor = self._scrape_page(cursor) if not __: - logger.info(f'Indeed found no jobs on page: {page}') + logger.info(f"Indeed found no jobs on page: {page}") break while len(self.seen_urls) < scraper_input.results_wanted: - logger.info(f'Indeed search page: {page}') + logger.info(f"Indeed search page: {page}") jobs, cursor = self._scrape_page(cursor) if not jobs: - logger.info(f'Indeed found no jobs on page: {page}') + logger.info(f"Indeed found no jobs on page: {page}") break job_list += jobs page += 1 - return JobResponse(jobs=job_list[:scraper_input.results_wanted]) + return JobResponse(jobs=job_list[: scraper_input.results_wanted]) def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]: """ @@ -89,31 +90,43 @@ class IndeedScraper(Scraper): jobs = [] new_cursor = None filters = self._build_filters() + location = ( + self.scraper_input.location + or self.scraper_input.country.value[0].split(",")[-1] + ) query = self.job_search_query.format( what=self.scraper_input.search_term, - location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1], + location=location, radius=self.scraper_input.distance, dateOnIndeed=self.scraper_input.hours_old, - cursor=f'cursor: "{cursor}"' if cursor else '', - filters=filters + cursor=f'cursor: "{cursor}"' if cursor else "", + filters=filters, ) payload = { - 'query': query, + "query": query, } api_headers = self.api_headers.copy() - api_headers['indeed-co'] = self.api_country_code - response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10) + api_headers["indeed-co"] = self.api_country_code + response = requests.post( + self.api_url, + headers=api_headers, + json=payload, + proxies=self.proxy, + timeout=10, + ) if response.status_code != 200: - logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)') + logger.info( + f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)" + ) return jobs, new_cursor data = response.json() - jobs = data['data']['jobSearch']['results'] - new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor'] + jobs = data["data"]["jobSearch"]["results"] + new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"] with ThreadPoolExecutor(max_workers=self.num_workers) as executor: job_results: list[Future] = [ - executor.submit(self._process_job, job['job']) for job in jobs - ] + executor.submit(self._process_job, job["job"]) for job in jobs + ] job_list = [result.result() for result in job_results if result.result()] return job_list, new_cursor @@ -131,7 +144,9 @@ class IndeedScraper(Scraper): start: "{start}h" }} }} - """.format(start=self.scraper_input.hours_old) + """.format( + start=self.scraper_input.hours_old + ) elif self.scraper_input.job_type or self.scraper_input.is_remote: job_type_key_mapping = { JobType.FULL_TIME: "CF3CP", @@ -174,22 +189,24 @@ class IndeedScraper(Scraper): if job_url in self.seen_urls: return self.seen_urls.add(job_url) - description = job['description']['html'] - description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description + description = job["description"]["html"] + if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: + description = markdown_converter(description) - job_type = self._get_job_type(job['attributes']) + job_type = self._get_job_type(job["attributes"]) timestamp_seconds = job["datePublished"] / 1000 date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d") - employer = job['employer'].get('dossier') if job['employer'] else None - employer_details = employer.get('employerDetails', {}) if employer else {} + employer = job["employer"].get("dossier") if job["employer"] else None + employer_details = employer.get("employerDetails", {}) if employer else {} + rel_url = job["employer"]["relativeCompanyPageUrl"] return JobPost( title=job["title"], description=description, - company_name=job['employer'].get("name") if job.get('employer') else None, - company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[ - 'employer'] else None, - company_url_direct=employer['links']['corporateWebsite'] if employer else None, - + company_name=job["employer"].get("name") if job.get("employer") else None, + company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None), + company_url_direct=( + employer["links"]["corporateWebsite"] if employer else None + ), location=Location( city=job.get("location", {}).get("city"), state=job.get("location", {}).get("admin1Code"), @@ -199,20 +216,39 @@ class IndeedScraper(Scraper): compensation=self._get_compensation(job), date_posted=date_posted, job_url=job_url, - job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None, + job_url_direct=( + job["recruit"].get("viewJobUrl") if job.get("recruit") else None + ), emails=extract_emails_from_text(description) if description else None, is_remote=self._is_job_remote(job, description), - - company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None, - company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None, - company_num_employees=employer_details.get('employeesLocalizedLabel'), - company_revenue=employer_details.get('revenueLocalizedLabel'), - company_description=employer_details.get('briefDescription'), - ceo_name=employer_details.get('ceoName'), - ceo_photo_url=employer_details.get('ceoPhotoUrl'), - - logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None, - banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None, + company_addresses=( + employer_details["addresses"][0] + if employer_details.get("addresses") + else None + ), + company_industry=( + employer_details["industry"] + .replace("Iv1", "") + .replace("_", " ") + .title() + if employer_details.get("industry") + else None + ), + company_num_employees=employer_details.get("employeesLocalizedLabel"), + company_revenue=employer_details.get("revenueLocalizedLabel"), + company_description=employer_details.get("briefDescription"), + ceo_name=employer_details.get("ceoName"), + ceo_photo_url=employer_details.get("ceoPhotoUrl"), + logo_photo_url=( + employer["images"].get("squareLogoUrl") + if employer and employer.get("images") + else None + ), + banner_photo_url=( + employer["images"].get("headerImageUrl") + if employer and employer.get("images") + else None + ), ) @staticmethod @@ -224,7 +260,7 @@ class IndeedScraper(Scraper): """ job_types: list[JobType] = [] for attribute in attributes: - job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower() + job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower() job_type = get_enum_from_job_type(job_type_str) if job_type: job_types.append(job_type) @@ -238,33 +274,41 @@ class IndeedScraper(Scraper): :param job: :return: compensation object """ - comp = job['compensation']['baseSalary'] - if comp: - interval = IndeedScraper._get_compensation_interval(comp['unitOfWork']) - if interval: - return Compensation( - interval=interval, - min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None, - max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None, - currency=job['compensation']['currencyCode'] - ) + comp = job["compensation"]["baseSalary"] + if not comp: + return None + interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"]) + if not interval: + return None + min_range = comp["range"].get("min") + max_range = comp["range"].get("max") + return Compensation( + interval=interval, + min_amount=round(min_range, 2) if min_range is not None else None, + max_amount=round(max_range, 2) if max_range is not None else None, + currency=job["compensation"]["currencyCode"], + ) @staticmethod def _is_job_remote(job: dict, description: str) -> bool: """ Searches the description, location, and attributes to check if job is remote """ - remote_keywords = ['remote', 'work from home', 'wfh'] + remote_keywords = ["remote", "work from home", "wfh"] is_remote_in_attributes = any( - any(keyword in attr['label'].lower() for keyword in remote_keywords) - for attr in job['attributes'] + any(keyword in attr["label"].lower() for keyword in remote_keywords) + for attr in job["attributes"] + ) + is_remote_in_description = any( + keyword in description.lower() for keyword in remote_keywords ) - is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords) is_remote_in_location = any( - keyword in job['location']['formatted']['long'].lower() + keyword in job["location"]["formatted"]["long"].lower() for keyword in remote_keywords ) - return is_remote_in_attributes or is_remote_in_description or is_remote_in_location + return ( + is_remote_in_attributes or is_remote_in_description or is_remote_in_location + ) @staticmethod def _get_compensation_interval(interval: str) -> CompensationInterval: @@ -273,7 +317,7 @@ class IndeedScraper(Scraper): "YEAR": "YEARLY", "HOUR": "HOURLY", "WEEK": "WEEKLY", - "MONTH": "MONTHLY" + "MONTH": "MONTHLY", } mapped_interval = interval_mapping.get(interval.upper(), None) if mapped_interval and mapped_interval in CompensationInterval.__members__: @@ -282,14 +326,14 @@ class IndeedScraper(Scraper): raise ValueError(f"Unsupported interval: {interval}") api_headers = { - 'Host': 'apis.indeed.com', - 'content-type': 'application/json', - 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', - 'accept': 'application/json', - 'indeed-locale': 'en-US', - 'accept-language': 'en-US,en;q=0.9', - 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1', - 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', + "Host": "apis.indeed.com", + "content-type": "application/json", + "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8", + "accept": "application/json", + "indeed-locale": "en-US", + "accept-language": "en-US,en;q=0.9", + "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1", + "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone", } job_search_query = """ query GetJobData {{