mirror of https://github.com/Bunsly/JobSpy

format: jobspy/scrapers/indeed
commit 63790da29f
parent 289f626e79
@@ -4,12 +4,13 @@ jobspy.scrapers.indeed

 This module contains routines to scrape Indeed.
 """
+
 from __future__ import annotations
+from typing import Tuple

 import math
-from concurrent.futures import ThreadPoolExecutor, Future
-from typing import Tuple
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, Future

 import requests
@@ -18,7 +19,7 @@ from ..utils import (
     extract_emails_from_text,
     get_enum_from_job_type,
     markdown_converter,
-    logger
+    logger,
 )
 from ...jobs import (
     JobPost,
@@ -27,7 +28,7 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    DescriptionFormat
+    DescriptionFormat,
 )

@@ -57,28 +58,28 @@ class IndeedScraper(Scraper):
         domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
         self.base_url = f"https://{domain}.indeed.com"
         self.headers = self.api_headers.copy()
-        self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value
+        self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
         job_list = []
         page = 1

         cursor = None
         offset_pages = math.ceil(self.scraper_input.offset / 100)
         for _ in range(offset_pages):
-            logger.info(f'Indeed skipping search page: {page}')
+            logger.info(f"Indeed skipping search page: {page}")
             __, cursor = self._scrape_page(cursor)
             if not __:
-                logger.info(f'Indeed found no jobs on page: {page}')
+                logger.info(f"Indeed found no jobs on page: {page}")
                 break

         while len(self.seen_urls) < scraper_input.results_wanted:
-            logger.info(f'Indeed search page: {page}')
+            logger.info(f"Indeed search page: {page}")
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
-                logger.info(f'Indeed found no jobs on page: {page}')
+                logger.info(f"Indeed found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1
-        return JobResponse(jobs=job_list[:scraper_input.results_wanted])
+        return JobResponse(jobs=job_list[: scraper_input.results_wanted])

     def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
         """
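
Aside: the hunk above is cursor-based pagination. A minimal sketch of the same control flow, with a hypothetical fetch_page callable standing in for IndeedScraper._scrape_page:

# fetch_page(cursor) -> (jobs, next_cursor), mirroring _scrape_page above.
def collect_jobs(fetch_page, results_wanted: int) -> list:
    jobs, cursor = [], None
    while len(jobs) < results_wanted:
        page_jobs, cursor = fetch_page(cursor)
        if not page_jobs:  # an empty page means the feed is exhausted
            break
        jobs += page_jobs
    return jobs[:results_wanted]
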
@@ -89,31 +90,43 @@ class IndeedScraper(Scraper):
         jobs = []
         new_cursor = None
         filters = self._build_filters()
+        location = (
+            self.scraper_input.location
+            or self.scraper_input.country.value[0].split(",")[-1]
+        )
         query = self.job_search_query.format(
             what=self.scraper_input.search_term,
-            location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
+            location=location,
             radius=self.scraper_input.distance,
             dateOnIndeed=self.scraper_input.hours_old,
-            cursor=f'cursor: "{cursor}"' if cursor else '',
-            filters=filters
+            cursor=f'cursor: "{cursor}"' if cursor else "",
+            filters=filters,
         )
         payload = {
-            'query': query,
+            "query": query,
         }
         api_headers = self.api_headers.copy()
-        api_headers['indeed-co'] = self.api_country_code
-        response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10)
+        api_headers["indeed-co"] = self.api_country_code
+        response = requests.post(
+            self.api_url,
+            headers=api_headers,
+            json=payload,
+            proxies=self.proxy,
+            timeout=10,
+        )
         if response.status_code != 200:
-            logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)')
+            logger.info(
+                f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
+            )
             return jobs, new_cursor
         data = response.json()
-        jobs = data['data']['jobSearch']['results']
-        new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor']
+        jobs = data["data"]["jobSearch"]["results"]
+        new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]

         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             job_results: list[Future] = [
-                executor.submit(self._process_job, job['job']) for job in jobs
-            ]
+                executor.submit(self._process_job, job["job"]) for job in jobs
+            ]
             job_list = [result.result() for result in job_results if result.result()]
             return job_list, new_cursor
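
Aside: the reflowed requests.post call above is a plain GraphQL POST. A self-contained sketch of the request shape, assuming a placeholder URL and query string (the real values live in self.api_url and job_search_query):

import requests

API_URL = "https://apis.indeed.com/graphql"  # placeholder, not confirmed by this diff
query = '{ jobSearch { results { job { title } } pageInfo { nextCursor } } }'  # illustrative query
response = requests.post(
    API_URL,
    headers={"content-type": "application/json"},
    json={"query": query},
    timeout=10,
)
if response.status_code == 200:
    data = response.json()
    # results and the next cursor sit under data["data"]["jobSearch"], as above
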
@@ -131,7 +144,9 @@ class IndeedScraper(Scraper):
                   start: "{start}h"
                 }}
               }}
-        """.format(start=self.scraper_input.hours_old)
+        """.format(
+            start=self.scraper_input.hours_old
+        )
         elif self.scraper_input.job_type or self.scraper_input.is_remote:
             job_type_key_mapping = {
                 JobType.FULL_TIME: "CF3CP",
@@ -174,22 +189,24 @@ class IndeedScraper(Scraper):
         if job_url in self.seen_urls:
             return
         self.seen_urls.add(job_url)
-        description = job['description']['html']
-        description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
+        description = job["description"]["html"]
+        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+            description = markdown_converter(description)

-        job_type = self._get_job_type(job['attributes'])
+        job_type = self._get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
         date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
-        employer = job['employer'].get('dossier') if job['employer'] else None
-        employer_details = employer.get('employerDetails', {}) if employer else {}
+        employer = job["employer"].get("dossier") if job["employer"] else None
+        employer_details = employer.get("employerDetails", {}) if employer else {}
+        rel_url = job["employer"]["relativeCompanyPageUrl"]
         return JobPost(
             title=job["title"],
             description=description,
-            company_name=job['employer'].get("name") if job.get('employer') else None,
-            company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[
-                'employer'] else None,
-            company_url_direct=employer['links']['corporateWebsite'] if employer else None,
-
+            company_name=job["employer"].get("name") if job.get("employer") else None,
+            company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
+            company_url_direct=(
+                employer["links"]["corporateWebsite"] if employer else None
+            ),
             location=Location(
                 city=job.get("location", {}).get("city"),
                 state=job.get("location", {}).get("admin1Code"),
@@ -199,20 +216,39 @@ class IndeedScraper(Scraper):
             compensation=self._get_compensation(job),
             date_posted=date_posted,
             job_url=job_url,
-            job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None,
+            job_url_direct=(
+                job["recruit"].get("viewJobUrl") if job.get("recruit") else None
+            ),
             emails=extract_emails_from_text(description) if description else None,
             is_remote=self._is_job_remote(job, description),
-
-            company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None,
-            company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None,
-            company_num_employees=employer_details.get('employeesLocalizedLabel'),
-            company_revenue=employer_details.get('revenueLocalizedLabel'),
-            company_description=employer_details.get('briefDescription'),
-            ceo_name=employer_details.get('ceoName'),
-            ceo_photo_url=employer_details.get('ceoPhotoUrl'),
-
-            logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None,
-            banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None,
+            company_addresses=(
+                employer_details["addresses"][0]
+                if employer_details.get("addresses")
+                else None
+            ),
+            company_industry=(
+                employer_details["industry"]
+                .replace("Iv1", "")
+                .replace("_", " ")
+                .title()
+                if employer_details.get("industry")
+                else None
+            ),
+            company_num_employees=employer_details.get("employeesLocalizedLabel"),
+            company_revenue=employer_details.get("revenueLocalizedLabel"),
+            company_description=employer_details.get("briefDescription"),
+            ceo_name=employer_details.get("ceoName"),
+            ceo_photo_url=employer_details.get("ceoPhotoUrl"),
+            logo_photo_url=(
+                employer["images"].get("squareLogoUrl")
+                if employer and employer.get("images")
+                else None
+            ),
+            banner_photo_url=(
+                employer["images"].get("headerImageUrl")
+                if employer and employer.get("images")
+                else None
+            ),
         )

     @staticmethod
@@ -224,7 +260,7 @@ class IndeedScraper(Scraper):
         """
         job_types: list[JobType] = []
         for attribute in attributes:
-            job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower()
+            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
             job_type = get_enum_from_job_type(job_type_str)
             if job_type:
                 job_types.append(job_type)
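
Aside: the normalization above collapses display labels into enum-friendly keys before the lookup, e.g.:

# "Full-time" -> "fulltime", which is what get_enum_from_job_type receives.
job_type_str = "Full-time".replace("-", "").replace(" ", "").lower()
assert job_type_str == "fulltime"
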
@@ -238,33 +274,41 @@ class IndeedScraper(Scraper):
         :param job:
         :return: compensation object
         """
-        comp = job['compensation']['baseSalary']
-        if comp:
-            interval = IndeedScraper._get_compensation_interval(comp['unitOfWork'])
-            if interval:
-                return Compensation(
-                    interval=interval,
-                    min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None,
-                    max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None,
-                    currency=job['compensation']['currencyCode']
-                )
+        comp = job["compensation"]["baseSalary"]
+        if not comp:
+            return None
+        interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
+        if not interval:
+            return None
+        min_range = comp["range"].get("min")
+        max_range = comp["range"].get("max")
+        return Compensation(
+            interval=interval,
+            min_amount=round(min_range, 2) if min_range is not None else None,
+            max_amount=round(max_range, 2) if max_range is not None else None,
+            currency=job["compensation"]["currencyCode"],
+        )

     @staticmethod
     def _is_job_remote(job: dict, description: str) -> bool:
         """
         Searches the description, location, and attributes to check if job is remote
         """
-        remote_keywords = ['remote', 'work from home', 'wfh']
+        remote_keywords = ["remote", "work from home", "wfh"]
         is_remote_in_attributes = any(
-            any(keyword in attr['label'].lower() for keyword in remote_keywords)
-            for attr in job['attributes']
+            any(keyword in attr["label"].lower() for keyword in remote_keywords)
+            for attr in job["attributes"]
         )
-        is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
+        is_remote_in_description = any(
+            keyword in description.lower() for keyword in remote_keywords
+        )
         is_remote_in_location = any(
-            keyword in job['location']['formatted']['long'].lower()
+            keyword in job["location"]["formatted"]["long"].lower()
             for keyword in remote_keywords
         )
-        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        return (
+            is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        )

     @staticmethod
     def _get_compensation_interval(interval: str) -> CompensationInterval:
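
Design note: the _get_compensation rewrite above trades nested ifs for guard clauses, so each bail-out is explicit. The same early-return pattern on a toy payload (parse_salary is illustrative, not part of this commit):

def parse_salary(comp: dict | None) -> tuple | None:
    if not comp:
        return None  # job carries no baseSalary block
    interval = comp.get("unitOfWork")
    if not interval:
        return None  # unmapped pay interval
    return (interval, comp["range"].get("min"), comp["range"].get("max"))

parse_salary({"unitOfWork": "YEAR", "range": {"min": 90000, "max": 120000}})
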
@@ -273,7 +317,7 @@ class IndeedScraper(Scraper):
             "YEAR": "YEARLY",
             "HOUR": "HOURLY",
             "WEEK": "WEEKLY",
-            "MONTH": "MONTHLY"
+            "MONTH": "MONTHLY",
         }
         mapped_interval = interval_mapping.get(interval.upper(), None)
         if mapped_interval and mapped_interval in CompensationInterval.__members__:
@@ -282,14 +326,14 @@ class IndeedScraper(Scraper):
         raise ValueError(f"Unsupported interval: {interval}")

     api_headers = {
-        'Host': 'apis.indeed.com',
-        'content-type': 'application/json',
-        'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
-        'accept': 'application/json',
-        'indeed-locale': 'en-US',
-        'accept-language': 'en-US,en;q=0.9',
-        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
-        'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
+        "Host": "apis.indeed.com",
+        "content-type": "application/json",
+        "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
+        "accept": "application/json",
+        "indeed-locale": "en-US",
+        "accept-language": "en-US,en;q=0.9",
+        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
+        "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
     }
     job_search_query = """
         query GetJobData {{