# jobspy/indeed/__init__.py
from __future__ import annotations
import math
from datetime import datetime
from typing import Tuple
from jobspy.indeed.constant import job_search_query, api_headers
from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
from jobspy.model import (
Scraper,
ScraperInput,
Site,
JobPost,
Location,
JobResponse,
JobType,
DescriptionFormat,
)
from jobspy.util import (
extract_emails_from_text,
markdown_converter,
create_session,
create_logger,
)
2023-09-28 16:33:14 -07:00
2025-02-21 10:29:28 -08:00
log = create_logger("Indeed")
2024-10-19 16:01:59 -07:00
2025-02-21 12:14:55 -08:00
class Indeed(Scraper):
2024-10-19 16:01:59 -07:00
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
2024-03-08 23:40:01 -08:00
Initializes IndeedScraper with the Indeed API url
"""
super().__init__(Site.INDEED, proxies=proxies)
2024-10-19 16:01:59 -07:00
self.session = create_session(
proxies=self.proxies, ca_cert=ca_cert, is_tls=False
)
2024-02-14 14:04:23 -08:00
self.scraper_input = None
2024-03-08 23:40:01 -08:00
self.jobs_per_page = 100
2024-02-14 14:04:23 -08:00
self.num_workers = 10
self.seen_urls = set()
2024-03-08 23:40:01 -08:00
self.headers = None
self.api_country_code = None
2024-02-14 14:04:23 -08:00
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
2023-07-07 19:00:59 -07:00
2024-02-14 14:04:23 -08:00
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
self.scraper_input = scraper_input
2024-03-08 23:40:01 -08:00
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com"
2024-10-19 16:01:59 -07:00
self.headers = api_headers.copy()
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
2024-03-08 23:40:01 -08:00
job_list = []
page = 1
cursor = None
2024-02-14 14:04:23 -08:00
2024-10-22 17:25:07 -07:00
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
2025-02-21 10:29:28 -08:00
log.info(
2024-10-24 13:19:40 -07:00
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
2024-10-19 16:01:59 -07:00
)
2024-03-08 23:40:01 -08:00
jobs, cursor = self._scrape_page(cursor)
if not jobs:
2025-02-21 10:29:28 -08:00
log.info(f"found no jobs on page: {page}")
2024-02-14 14:04:23 -08:00
break
2024-03-08 23:40:01 -08:00
job_list += jobs
page += 1
2024-10-22 17:25:07 -07:00
return JobResponse(
jobs=job_list[
scraper_input.offset : scraper_input.offset
+ scraper_input.results_wanted
]
)
2024-02-14 14:04:23 -08:00
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
2024-03-08 23:40:01 -08:00
:param cursor:
:return: jobs found on page, next page cursor
"""
2024-03-08 23:40:01 -08:00
jobs = []
new_cursor = None
filters = self._build_filters()
search_term = (
self.scraper_input.search_term.replace('"', '\\"')
if self.scraper_input.search_term
else ""
)
2024-10-19 16:01:59 -07:00
query = job_search_query.format(
what=(f'what: "{search_term}"' if search_term else ""),
2024-03-11 07:42:43 -07:00
location=(
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
if self.scraper_input.location
else ""
),
2024-03-08 23:40:01 -08:00
dateOnIndeed=self.scraper_input.hours_old,
cursor=f'cursor: "{cursor}"' if cursor else "",
filters=filters,
2024-03-08 23:40:01 -08:00
)
payload = {
"query": query,
2024-03-08 23:40:01 -08:00
}
2024-10-19 16:01:59 -07:00
api_headers_temp = api_headers.copy()
api_headers_temp["indeed-co"] = self.api_country_code
response = self.session.post(
self.api_url,
2024-10-19 16:01:59 -07:00
headers=api_headers_temp,
json=payload,
timeout=10,
2025-02-21 10:29:28 -08:00
verify=False,
)
2024-10-19 16:01:59 -07:00
if not response.ok:
2025-02-21 10:29:28 -08:00
log.info(
2024-10-19 16:01:59 -07:00
f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
)
2024-03-08 23:40:01 -08:00
return jobs, new_cursor
data = response.json()
jobs = data["data"]["jobSearch"]["results"]
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
2024-02-09 10:05:10 -08:00
2024-10-19 16:01:59 -07:00
job_list = []
for job in jobs:
processed_job = self._process_job(job["job"])
if processed_job:
job_list.append(processed_job)
2024-03-08 23:40:01 -08:00
return job_list, new_cursor
2023-07-11 09:02:46 -07:00
2024-03-08 23:40:01 -08:00
def _build_filters(self):
"""
Builds the filters dict for job type/is_remote. If hours_old is provided, composite filter for job_type/is_remote is not possible.
IndeedApply: filters: { keyword: { field: "indeedApplyScope", keys: ["DESKTOP"] } }
"""
filters_str = ""
if self.scraper_input.hours_old:
filters_str = """
filters: {{
date: {{
field: "dateOnIndeed",
start: "{start}h"
}}
}}
""".format(
start=self.scraper_input.hours_old
)
2024-03-11 19:23:20 -07:00
elif self.scraper_input.easy_apply:
filters_str = """
filters: {
keyword: {
field: "indeedApplyScope",
keys: ["DESKTOP"]
}
}
"""
2024-03-08 23:40:01 -08:00
elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP",
JobType.PART_TIME: "75GKK",
JobType.CONTRACT: "NJXCK",
JobType.INTERNSHIP: "VDTG7",
}
keys = []
if self.scraper_input.job_type:
key = job_type_key_mapping[self.scraper_input.job_type]
keys.append(key)
if self.scraper_input.is_remote:
keys.append("DSQF7")
if keys:
2024-07-15 18:30:04 -07:00
keys_str = '", "'.join(keys)
2024-03-08 23:40:01 -08:00
filters_str = f"""
filters: {{
composite: {{
filters: [{{
keyword: {{
field: "attributes",
keys: ["{keys_str}"]
}}
}}]
}}
}}
"""
return filters_str
2023-07-11 09:02:46 -07:00
2024-03-08 23:40:01 -08:00
def _process_job(self, job: dict) -> JobPost | None:
"""
Parses the job dict into JobPost model
:param job: dict to parse
:return: JobPost if it's a new job
"""
job_url = f'{self.base_url}/viewjob?jk={job["key"]}'
2024-02-14 14:04:23 -08:00
if job_url in self.seen_urls:
2024-03-08 23:40:01 -08:00
return
2024-02-14 14:04:23 -08:00
self.seen_urls.add(job_url)
description = job["description"]["html"]
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
2024-03-08 23:40:01 -08:00
2025-02-21 12:14:55 -08:00
job_type = get_job_type(job["attributes"])
2024-03-08 23:40:01 -08:00
timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job["employer"].get("dossier") if job["employer"] else None
employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
2024-02-14 14:04:23 -08:00
return JobPost(
2024-10-19 16:01:59 -07:00
id=f'in-{job["key"]}',
2024-03-08 23:40:01 -08:00
title=job["title"],
2024-02-14 14:04:23 -08:00
description=description,
company_name=job["employer"].get("name") if job.get("employer") else None,
company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
company_url_direct=(
employer["links"]["corporateWebsite"] if employer else None
),
2024-02-14 14:04:23 -08:00
location=Location(
2024-03-08 23:40:01 -08:00
city=job.get("location", {}).get("city"),
state=job.get("location", {}).get("admin1Code"),
country=job.get("location", {}).get("countryCode"),
2024-02-14 14:04:23 -08:00
),
job_type=job_type,
2025-02-21 12:14:55 -08:00
compensation=get_compensation(job["compensation"]),
2024-02-14 14:04:23 -08:00
date_posted=date_posted,
2024-03-08 23:40:01 -08:00
job_url=job_url,
job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
2024-02-14 14:04:23 -08:00
emails=extract_emails_from_text(description) if description else None,
2025-02-21 12:14:55 -08:00
is_remote=is_job_remote(job, description),
company_addresses=(
employer_details["addresses"][0]
if employer_details.get("addresses")
else None
),
company_industry=(
employer_details["industry"]
.replace("Iv1", "")
.replace("_", " ")
.title()
2024-07-15 19:19:01 -07:00
.strip()
if employer_details.get("industry")
else None
),
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
2024-10-24 13:19:40 -07:00
company_logo=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
),
2024-02-14 14:04:23 -08:00
)