format: jobspy/scrapers/indeed

pull/127/head
VitaminB16 2024-03-09 19:48:37 +00:00
parent 289f626e79
commit 63790da29f
1 changed file with 114 additions and 70 deletions

View File

@ -4,12 +4,13 @@ jobspy.scrapers.indeed
This module contains routines to scrape Indeed. This module contains routines to scrape Indeed.
""" """
from __future__ import annotations from __future__ import annotations
from typing import Tuple
import math import math
from concurrent.futures import ThreadPoolExecutor, Future from typing import Tuple
from datetime import datetime from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, Future
import requests import requests
@ -18,7 +19,7 @@ from ..utils import (
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
markdown_converter, markdown_converter,
logger logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -27,7 +28,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat DescriptionFormat,
) )
@ -57,24 +58,24 @@ class IndeedScraper(Scraper):
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com" self.base_url = f"https://{domain}.indeed.com"
self.headers = self.api_headers.copy() self.headers = self.api_headers.copy()
self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
job_list = [] job_list = []
page = 1 page = 1
cursor = None cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100) offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages): for _ in range(offset_pages):
logger.info(f'Indeed skipping search page: {page}') logger.info(f"Indeed skipping search page: {page}")
__, cursor = self._scrape_page(cursor) __, cursor = self._scrape_page(cursor)
if not __: if not __:
logger.info(f'Indeed found no jobs on page: {page}') logger.info(f"Indeed found no jobs on page: {page}")
break break
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
logger.info(f'Indeed search page: {page}') logger.info(f"Indeed search page: {page}")
jobs, cursor = self._scrape_page(cursor) jobs, cursor = self._scrape_page(cursor)
if not jobs: if not jobs:
logger.info(f'Indeed found no jobs on page: {page}') logger.info(f"Indeed found no jobs on page: {page}")
break break
job_list += jobs job_list += jobs
page += 1 page += 1
@ -89,30 +90,42 @@ class IndeedScraper(Scraper):
jobs = [] jobs = []
new_cursor = None new_cursor = None
filters = self._build_filters() filters = self._build_filters()
location = (
self.scraper_input.location
or self.scraper_input.country.value[0].split(",")[-1]
)
query = self.job_search_query.format( query = self.job_search_query.format(
what=self.scraper_input.search_term, what=self.scraper_input.search_term,
location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1], location=location,
radius=self.scraper_input.distance, radius=self.scraper_input.distance,
dateOnIndeed=self.scraper_input.hours_old, dateOnIndeed=self.scraper_input.hours_old,
cursor=f'cursor: "{cursor}"' if cursor else '', cursor=f'cursor: "{cursor}"' if cursor else "",
filters=filters filters=filters,
) )
payload = { payload = {
'query': query, "query": query,
} }
api_headers = self.api_headers.copy() api_headers = self.api_headers.copy()
api_headers['indeed-co'] = self.api_country_code api_headers["indeed-co"] = self.api_country_code
response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10) response = requests.post(
self.api_url,
headers=api_headers,
json=payload,
proxies=self.proxy,
timeout=10,
)
if response.status_code != 200: if response.status_code != 200:
logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)') logger.info(
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
)
return jobs, new_cursor return jobs, new_cursor
data = response.json() data = response.json()
jobs = data['data']['jobSearch']['results'] jobs = data["data"]["jobSearch"]["results"]
new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor'] new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
with ThreadPoolExecutor(max_workers=self.num_workers) as executor: with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(self._process_job, job['job']) for job in jobs executor.submit(self._process_job, job["job"]) for job in jobs
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
return job_list, new_cursor return job_list, new_cursor
@ -131,7 +144,9 @@ class IndeedScraper(Scraper):
start: "{start}h" start: "{start}h"
}} }}
}} }}
""".format(start=self.scraper_input.hours_old) """.format(
start=self.scraper_input.hours_old
)
elif self.scraper_input.job_type or self.scraper_input.is_remote: elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = { job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP", JobType.FULL_TIME: "CF3CP",
@ -174,22 +189,24 @@ class IndeedScraper(Scraper):
if job_url in self.seen_urls: if job_url in self.seen_urls:
return return
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
description = job['description']['html'] description = job["description"]["html"]
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
job_type = self._get_job_type(job['attributes']) job_type = self._get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000 timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d") date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job['employer'].get('dossier') if job['employer'] else None employer = job["employer"].get("dossier") if job["employer"] else None
employer_details = employer.get('employerDetails', {}) if employer else {} employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"]
return JobPost( return JobPost(
title=job["title"], title=job["title"],
description=description, description=description,
company_name=job['employer'].get("name") if job.get('employer') else None, company_name=job["employer"].get("name") if job.get("employer") else None,
company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[ company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
'employer'] else None, company_url_direct=(
company_url_direct=employer['links']['corporateWebsite'] if employer else None, employer["links"]["corporateWebsite"] if employer else None
),
location=Location( location=Location(
city=job.get("location", {}).get("city"), city=job.get("location", {}).get("city"),
state=job.get("location", {}).get("admin1Code"), state=job.get("location", {}).get("admin1Code"),
@ -199,20 +216,39 @@ class IndeedScraper(Scraper):
compensation=self._get_compensation(job), compensation=self._get_compensation(job),
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None, job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
is_remote=self._is_job_remote(job, description), is_remote=self._is_job_remote(job, description),
company_addresses=(
company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None, employer_details["addresses"][0]
company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None, if employer_details.get("addresses")
company_num_employees=employer_details.get('employeesLocalizedLabel'), else None
company_revenue=employer_details.get('revenueLocalizedLabel'), ),
company_description=employer_details.get('briefDescription'), company_industry=(
ceo_name=employer_details.get('ceoName'), employer_details["industry"]
ceo_photo_url=employer_details.get('ceoPhotoUrl'), .replace("Iv1", "")
.replace("_", " ")
logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None, .title()
banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None, if employer_details.get("industry")
else None
),
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
ceo_name=employer_details.get("ceoName"),
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
logo_photo_url=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
),
banner_photo_url=(
employer["images"].get("headerImageUrl")
if employer and employer.get("images")
else None
),
) )
@staticmethod @staticmethod
@ -224,7 +260,7 @@ class IndeedScraper(Scraper):
""" """
job_types: list[JobType] = [] job_types: list[JobType] = []
for attribute in attributes: for attribute in attributes:
job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower() job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
job_type = get_enum_from_job_type(job_type_str) job_type = get_enum_from_job_type(job_type_str)
if job_type: if job_type:
job_types.append(job_type) job_types.append(job_type)
@ -238,15 +274,19 @@ class IndeedScraper(Scraper):
:param job: :param job:
:return: compensation object :return: compensation object
""" """
comp = job['compensation']['baseSalary'] comp = job["compensation"]["baseSalary"]
if comp: if not comp:
interval = IndeedScraper._get_compensation_interval(comp['unitOfWork']) return None
if interval: interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
if not interval:
return None
min_range = comp["range"].get("min")
max_range = comp["range"].get("max")
return Compensation( return Compensation(
interval=interval, interval=interval,
min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None, min_amount=round(min_range, 2) if min_range is not None else None,
max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None, max_amount=round(max_range, 2) if max_range is not None else None,
currency=job['compensation']['currencyCode'] currency=job["compensation"]["currencyCode"],
) )
@staticmethod @staticmethod
@ -254,17 +294,21 @@ class IndeedScraper(Scraper):
""" """
Searches the description, location, and attributes to check if job is remote Searches the description, location, and attributes to check if job is remote
""" """
remote_keywords = ['remote', 'work from home', 'wfh'] remote_keywords = ["remote", "work from home", "wfh"]
is_remote_in_attributes = any( is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords) any(keyword in attr["label"].lower() for keyword in remote_keywords)
for attr in job['attributes'] for attr in job["attributes"]
)
is_remote_in_description = any(
keyword in description.lower() for keyword in remote_keywords
) )
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
is_remote_in_location = any( is_remote_in_location = any(
keyword in job['location']['formatted']['long'].lower() keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords for keyword in remote_keywords
) )
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location return (
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
)
@staticmethod @staticmethod
def _get_compensation_interval(interval: str) -> CompensationInterval: def _get_compensation_interval(interval: str) -> CompensationInterval:
@ -273,7 +317,7 @@ class IndeedScraper(Scraper):
"YEAR": "YEARLY", "YEAR": "YEARLY",
"HOUR": "HOURLY", "HOUR": "HOURLY",
"WEEK": "WEEKLY", "WEEK": "WEEKLY",
"MONTH": "MONTHLY" "MONTH": "MONTHLY",
} }
mapped_interval = interval_mapping.get(interval.upper(), None) mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__: if mapped_interval and mapped_interval in CompensationInterval.__members__:
@ -282,14 +326,14 @@ class IndeedScraper(Scraper):
raise ValueError(f"Unsupported interval: {interval}") raise ValueError(f"Unsupported interval: {interval}")
api_headers = { api_headers = {
'Host': 'apis.indeed.com', "Host": "apis.indeed.com",
'content-type': 'application/json', "content-type": "application/json",
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
'accept': 'application/json', "accept": "application/json",
'indeed-locale': 'en-US', "indeed-locale": "en-US",
'accept-language': 'en-US,en;q=0.9', "accept-language": "en-US,en;q=0.9",
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1', "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
} }
job_search_query = """ job_search_query = """
query GetJobData {{ query GetJobData {{