# Mirror of https://github.com/Bunsly/JobSpy (436 lines, 15 KiB, Python)
"""
jobspy.scrapers.indeed
~~~~~~~~~~~~~~~~~~~~~~

This module contains routines to scrape Indeed.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
from typing import Tuple
|
|
from datetime import datetime
|
|
from concurrent.futures import ThreadPoolExecutor, Future
|
|
|
|
import requests
|
|
|
|
from .. import Scraper, ScraperInput, Site
|
|
from ..utils import (
|
|
extract_emails_from_text,
|
|
get_enum_from_job_type,
|
|
markdown_converter,
|
|
logger,
|
|
)
|
|
from ...jobs import (
|
|
JobPost,
|
|
Compensation,
|
|
CompensationInterval,
|
|
Location,
|
|
JobResponse,
|
|
JobType,
|
|
DescriptionFormat,
|
|
)
|
|
|
|
|
|
class IndeedScraper(Scraper):
    """
    Scraper that queries the Indeed GraphQL endpoint (``api_url``) and converts
    the returned job records into ``JobPost`` models.

    Instance state:
      * ``scraper_input``    - criteria of the search currently being run
      * ``jobs_per_page``    - page size; must match ``limit`` in ``job_search_query``
      * ``num_workers``      - thread pool size used to parse the jobs of one page
      * ``seen_urls``        - job URLs already processed (used for de-duplication)
      * ``api_country_code`` - value sent in the ``indeed-co`` request header
      * ``base_url``         - country specific site root used to build job URLs
    """

    def __init__(self, proxy: str | None = None):
        """
        Initializes IndeedScraper with the Indeed API url

        :param proxy: optional proxy, forwarded unchanged to the base Scraper
        """
        self.scraper_input = None
        self.jobs_per_page = 100
        self.num_workers = 10
        self.seen_urls = set()
        self.headers = None
        self.api_country_code = None
        self.base_url = None
        self.api_url = "https://apis.indeed.com/graphql"
        site = Site(Site.INDEED)
        super().__init__(site, proxy=proxy)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Indeed for jobs with scraper_input criteria

        Pages covered by ``scraper_input.offset`` are fetched first (the API is
        cursor based, so they cannot be jumped over) and their jobs discarded.
        Pages are then collected until ``results_wanted`` jobs were gathered, a
        page yields no new jobs, or the API reports no further cursor.

        :param scraper_input: search criteria
        :return: job_response holding at most ``results_wanted`` jobs
        """
        self.scraper_input = scraper_input
        domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
        self.base_url = f"https://{domain}.indeed.com"
        self.headers = self.api_headers.copy()
        # The header carries only the country code, not the (domain, code) tuple.
        self.headers["indeed-co"] = self.api_country_code

        job_list: list[JobPost] = []
        page = 1
        cursor = None

        offset_pages = math.ceil(self.scraper_input.offset / self.jobs_per_page)
        for _ in range(offset_pages):
            logger.info(f"Indeed skipping search page: {page}")
            skipped_jobs, cursor = self._scrape_page(cursor)
            if not skipped_jobs:
                logger.info(f"Indeed found no jobs on page: {page}")
                # The offset lies beyond the available results.
                return JobResponse(jobs=[])
            page += 1
            if not cursor:
                # Without a cursor the next request would restart at page one.
                return JobResponse(jobs=[])

        # Count collected jobs rather than seen_urls: seen_urls also contains
        # the URLs of the skipped offset pages and would end the loop too early.
        while len(job_list) < scraper_input.results_wanted:
            logger.info(f"Indeed search page: {page}")
            jobs, cursor = self._scrape_page(cursor)
            if not jobs:
                logger.info(f"Indeed found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1
            if not cursor:
                # No further pages; a cursor-less request would re-fetch page one.
                break
        return JobResponse(jobs=job_list[: scraper_input.results_wanted])

    def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
        """
        Scrapes a page of Indeed for jobs with scraper_input criteria

        Network errors raised by ``requests.post`` are not caught here.

        :param cursor: cursor of the page to fetch, None for the first page
        :return: jobs found on page, next page cursor
        """
        search_term = self.scraper_input.search_term
        location = self.scraper_input.location

        what_clause = (
            f'what: "{self._escape_graphql_string(search_term)}"'
            if search_term
            else ""
        )
        location_clause = (
            f'location: {{where: "{self._escape_graphql_string(location)}", '
            f"radius: {self.scraper_input.distance}, radiusUnit: MILES}}"
            if location
            else ""
        )
        query = self.job_search_query.format(
            what=what_clause,
            location=location_clause,
            cursor=f'cursor: "{cursor}"' if cursor else "",
            filters=self._build_filters(),
        )
        payload = {
            "query": query,
        }
        api_headers = self.api_headers.copy()
        api_headers["indeed-co"] = self.api_country_code
        response = requests.post(
            self.api_url,
            headers=api_headers,
            json=payload,
            proxies=self.proxy,
            timeout=10,
        )
        if response.status_code != 200:
            logger.info(
                f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
            )
            return [], None

        data = response.json()
        # A 200 response may still lack the search payload; treat that as an
        # empty page instead of failing on a missing key.
        job_search = (data.get("data") or {}).get("jobSearch")
        if not job_search:
            logger.info(
                "Indeed response contained no jobSearch data (submit GitHub issue if this appears to be a bug)"
            )
            return [], None
        results = job_search.get("results") or []
        new_cursor = (job_search.get("pageInfo") or {}).get("nextCursor")

        # NOTE: _process_job runs on worker threads and does a non-atomic
        # check-then-add on self.seen_urls.
        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            job_results: list[Future] = [
                executor.submit(self._process_job, result["job"]) for result in results
            ]
        # _process_job returns None for already seen jobs; drop those.
        parsed_jobs = (future.result() for future in job_results)
        job_list = [job_post for job_post in parsed_jobs if job_post]
        return job_list, new_cursor

    @staticmethod
    def _escape_graphql_string(value: str) -> str:
        """
        Escapes backslashes and double quotes so the value can be embedded in a
        double-quoted GraphQL string literal

        :param value: raw user supplied text
        :return: escaped text
        """
        # Backslashes first, otherwise the ones added for quotes get doubled.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def _build_filters(self) -> str:
        """
        Builds the ``filters`` argument (a GraphQL fragment string) for the search query.

        Only one filter is emitted. Precedence, highest first:
          1. job_type / is_remote -> composite keyword filter on "attributes"
          2. easy_apply           -> keyword filter on "indeedApplyScope"
          3. hours_old            -> date filter on "dateOnIndeed"
        If hours_old is provided, composite filter for job_type/is_remote is not possible.

        :return: filter fragment, or an empty string when no filter applies
        :raises KeyError: if scraper_input.job_type has no entry in the key mapping
        """
        if self.scraper_input.job_type or self.scraper_input.is_remote:
            job_type_key_mapping = {
                JobType.FULL_TIME: "CF3CP",
                JobType.PART_TIME: "75GKK",
                JobType.CONTRACT: "NJXCK",
                JobType.INTERNSHIP: "VDTG7",
            }

            keys = []
            if self.scraper_input.job_type:
                keys.append(job_type_key_mapping[self.scraper_input.job_type])
            if self.scraper_input.is_remote:
                keys.append("DSQF7")

            # Joined with '", "' because the template supplies the outer quotes.
            keys_str = '", "'.join(keys)
            return f"""
            filters: {{
              composite: {{
                filters: [{{
                  keyword: {{
                    field: "attributes",
                    keys: ["{keys_str}"]
                  }}
                }}]
              }}
            }}
            """

        if self.scraper_input.easy_apply:
            return """
            filters: {
              keyword: {
                field: "indeedApplyScope",
                keys: ["DESKTOP"]
              }
            }
            """

        if self.scraper_input.hours_old:
            return """
            filters: {{
              date: {{
                field: "dateOnIndeed",
                start: "{start}h"
              }}
            }}
            """.format(
                start=self.scraper_input.hours_old
            )

        return ""

    def _process_job(self, job: dict) -> JobPost | None:
        """
        Parses the job dict into JobPost model

        Optional sub-objects (employer, dossier, links, images, location,
        recruit) may be missing or null; the derived fields are then None.

        :param job: dict to parse
        :return: JobPost if it's a new job, None if its URL was already seen
        """
        job_url = f'{self.base_url}/viewjob?jk={job["key"]}'
        if job_url in self.seen_urls:
            return None
        self.seen_urls.add(job_url)

        description = job["description"]["html"]
        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            description = markdown_converter(description)

        job_type = self._get_job_type(job["attributes"])
        # datePublished is divided by 1000 before use, i.e. treated as milliseconds.
        timestamp_seconds = job["datePublished"] / 1000
        date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")

        # Normalize null sub-objects to empty dicts so lookups below cannot fail.
        employer = job.get("employer") or {}
        dossier = employer.get("dossier") or {}
        employer_details = dossier.get("employerDetails") or {}
        images = dossier.get("images") or {}
        links = dossier.get("links") or {}
        location = job.get("location") or {}
        recruit = job.get("recruit") or {}

        rel_url = employer.get("relativeCompanyPageUrl")
        addresses = employer_details.get("addresses")
        industry = employer_details.get("industry")

        return JobPost(
            id=str(job["key"]),
            title=job["title"],
            description=description,
            company_name=employer.get("name"),
            # Only build the URL when a relative path exists (avoids "...None").
            company_url=f"{self.base_url}{rel_url}" if rel_url else None,
            company_url_direct=links.get("corporateWebsite"),
            location=Location(
                city=location.get("city"),
                state=location.get("admin1Code"),
                country=location.get("countryCode"),
            ),
            job_type=job_type,
            compensation=self._get_compensation(job),
            date_posted=date_posted,
            job_url=job_url,
            job_url_direct=recruit.get("viewJobUrl"),
            emails=extract_emails_from_text(description) if description else None,
            is_remote=self._is_job_remote(job, description),
            company_addresses=addresses[0] if addresses else None,
            company_industry=(
                industry.replace("Iv1", "").replace("_", " ").title()
                if industry
                else None
            ),
            company_num_employees=employer_details.get("employeesLocalizedLabel"),
            company_revenue=employer_details.get("revenueLocalizedLabel"),
            company_description=employer_details.get("briefDescription"),
            ceo_name=employer_details.get("ceoName"),
            ceo_photo_url=employer_details.get("ceoPhotoUrl"),
            logo_photo_url=images.get("squareLogoUrl"),
            banner_photo_url=images.get("headerImageUrl"),
        )

    @staticmethod
    def _get_job_type(attributes: list) -> list[JobType]:
        """
        Parses the attributes to get list of job types

        :param attributes: attribute dicts of a job, each with a "label" key
        :return: list of JobType for the labels recognised by get_enum_from_job_type
        """
        job_types: list[JobType] = []
        for attribute in attributes or []:
            # Labels are normalized (no dashes/spaces, lower case) before lookup.
            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
            job_type = get_enum_from_job_type(job_type_str)
            if job_type:
                job_types.append(job_type)
        return job_types

    @staticmethod
    def _get_compensation(job: dict) -> Compensation | None:
        """
        Parses the job to get compensation

        :param job: job dict as returned by the API
        :return: compensation object, or None when the job has no base salary
                 or its pay interval is missing/unsupported
        """
        compensation = job.get("compensation") or {}
        comp = compensation.get("baseSalary")
        if not comp:
            return None

        unit_of_work = comp.get("unitOfWork")
        if not unit_of_work:
            return None
        try:
            interval = IndeedScraper._get_compensation_interval(unit_of_work)
        except ValueError:
            # An unknown interval must not abort parsing of the whole page.
            logger.info(f"Indeed unsupported compensation interval: {unit_of_work}")
            return None

        salary_range = comp.get("range") or {}
        min_range = salary_range.get("min")
        max_range = salary_range.get("max")
        return Compensation(
            interval=interval,
            min_amount=round(min_range, 2) if min_range is not None else None,
            max_amount=round(max_range, 2) if max_range is not None else None,
            currency=compensation.get("currencyCode"),
        )

    @staticmethod
    def _is_job_remote(job: dict, description: str) -> bool:
        """
        Searches the description, location, and attributes to check if job is remote

        :param job: job dict as returned by the API
        :param description: job description text, may be empty or None
        :return: True if any remote keyword occurs in one of the three sources
        """
        remote_keywords = ("remote", "work from home", "wfh")

        def mentions_remote(text) -> bool:
            # Missing text counts as "no mention" rather than raising.
            lowered = (text or "").lower()
            return any(keyword in lowered for keyword in remote_keywords)

        attributes = job.get("attributes") or []
        formatted_location = (job.get("location") or {}).get("formatted") or {}

        return (
            any(mentions_remote(attr.get("label")) for attr in attributes)
            or mentions_remote(description)
            or mentions_remote(formatted_location.get("long"))
        )

    @staticmethod
    def _get_compensation_interval(interval: str) -> CompensationInterval:
        """
        Maps an Indeed unit of work to a CompensationInterval member

        :param interval: unit of work, matched case-insensitively (e.g. "YEAR")
        :return: matching CompensationInterval
        :raises ValueError: if the interval is unknown or has no enum member
        """
        interval_mapping = {
            "DAY": "DAILY",
            "YEAR": "YEARLY",
            "HOUR": "HOURLY",
            "WEEK": "WEEKLY",
            "MONTH": "MONTHLY",
        }
        mapped_interval = interval_mapping.get(interval.upper())
        if mapped_interval and mapped_interval in CompensationInterval.__members__:
            return CompensationInterval[mapped_interval]
        raise ValueError(f"Unsupported interval: {interval}")

    # Base headers sent with every API request; "indeed-co" is added per request.
    # NOTE(review): the API key is committed in source. Presumably it is the
    # public key shipped with the Indeed mobile app (see user-agent) - confirm.
    api_headers = {
        "Host": "apis.indeed.com",
        "content-type": "application/json",
        "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
        "accept": "application/json",
        "indeed-locale": "en-US",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
        "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
    }

    # str.format template of the search query. Literal braces are doubled;
    # placeholders are {what}, {location}, {cursor} and {filters}.
    # "limit" must stay in sync with jobs_per_page.
    job_search_query = """
        query GetJobData {{
          jobSearch(
            {what}
            {location}
            includeSponsoredResults: NONE
            limit: 100
            sort: DATE
            {cursor}
            {filters}
          ) {{
            pageInfo {{
              nextCursor
            }}
            results {{
              trackingKey
              job {{
                key
                title
                datePublished
                dateOnIndeed
                description {{
                  html
                }}
                location {{
                  countryName
                  countryCode
                  admin1Code
                  city
                  postalCode
                  streetAddress
                  formatted {{
                    short
                    long
                  }}
                }}
                compensation {{
                  baseSalary {{
                    unitOfWork
                    range {{
                      ... on Range {{
                        min
                        max
                      }}
                    }}
                  }}
                  currencyCode
                }}
                attributes {{
                  key
                  label
                }}
                employer {{
                  relativeCompanyPageUrl
                  name
                  dossier {{
                      employerDetails {{
                        addresses
                        industry
                        employeesLocalizedLabel
                        revenueLocalizedLabel
                        briefDescription
                        ceoName
                        ceoPhotoUrl
                      }}
                      images {{
                            headerImageUrl
                            squareLogoUrl
                      }}
                      links {{
                        corporateWebsite
                      }}
                  }}
                }}
                recruit {{
                  viewJobUrl
                  detailedSalary
                  workSchedule
                }}
              }}
            }}
          }}
        }}
        """
|