mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
enh: indeed more fields (#126)
This commit is contained in:
@@ -3,6 +3,7 @@ from typing import Tuple
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from .jobs import JobType, Location
|
||||
from .scrapers.utils import logger
|
||||
from .scrapers.indeed import IndeedScraper
|
||||
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
||||
from .scrapers.glassdoor import GlassdoorScraper
|
||||
@@ -20,7 +21,7 @@ def scrape_jobs(
|
||||
site_name: str | list[str] | Site | list[Site] | None = None,
|
||||
search_term: str | None = None,
|
||||
location: str | None = None,
|
||||
distance: int | None = None,
|
||||
distance: int | None = 50,
|
||||
is_remote: bool = False,
|
||||
job_type: str | None = None,
|
||||
easy_apply: bool | None = None,
|
||||
@@ -92,6 +93,8 @@ def scrape_jobs(
|
||||
scraper_class = SCRAPER_MAPPING[site]
|
||||
scraper = scraper_class(proxy=proxy)
|
||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||
site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
|
||||
logger.info(f"{site_name} finished scraping")
|
||||
return site.value, scraped_data
|
||||
|
||||
site_to_jobs_dict = {}
|
||||
@@ -160,11 +163,11 @@ def scrape_jobs(
|
||||
|
||||
# Desired column order
|
||||
desired_order = [
|
||||
"job_url_hyper" if hyperlinks else "job_url",
|
||||
"site",
|
||||
"job_url_hyper" if hyperlinks else "job_url",
|
||||
"job_url_direct",
|
||||
"title",
|
||||
"company",
|
||||
"company_url",
|
||||
"location",
|
||||
"job_type",
|
||||
"date_posted",
|
||||
@@ -173,10 +176,20 @@ def scrape_jobs(
|
||||
"max_amount",
|
||||
"currency",
|
||||
"is_remote",
|
||||
"num_urgent_words",
|
||||
"benefits",
|
||||
"emails",
|
||||
"description",
|
||||
|
||||
"company_url",
|
||||
"company_url_direct",
|
||||
"company_addresses",
|
||||
"company_industry",
|
||||
"company_num_employees",
|
||||
"company_revenue",
|
||||
"company_description",
|
||||
"logo_photo_url",
|
||||
"banner_photo_url",
|
||||
"ceo_name",
|
||||
"ceo_photo_url",
|
||||
]
|
||||
|
||||
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
||||
|
||||
@@ -57,7 +57,7 @@ class JobType(Enum):
|
||||
class Country(Enum):
|
||||
"""
|
||||
Gets the subdomain for Indeed and Glassdoor.
|
||||
The second item in the tuple is the subdomain for Indeed
|
||||
The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
|
||||
The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
|
||||
"""
|
||||
|
||||
@@ -118,8 +118,8 @@ class Country(Enum):
|
||||
TURKEY = ("turkey", "tr")
|
||||
UKRAINE = ("ukraine", "ua")
|
||||
UNITEDARABEMIRATES = ("united arab emirates", "ae")
|
||||
UK = ("uk,united kingdom", "uk", "co.uk")
|
||||
USA = ("usa,us,united states", "www", "com")
|
||||
UK = ("uk,united kingdom", "uk:gb", "co.uk")
|
||||
USA = ("usa,us,united states", "www:us", "com")
|
||||
URUGUAY = ("uruguay", "uy")
|
||||
VENEZUELA = ("venezuela", "ve")
|
||||
VIETNAM = ("vietnam", "vn", "com")
|
||||
@@ -132,7 +132,10 @@ class Country(Enum):
|
||||
|
||||
@property
|
||||
def indeed_domain_value(self):
|
||||
return self.value[1]
|
||||
subdomain, _, api_country_code = self.value[1].partition(":")
|
||||
if subdomain and api_country_code:
|
||||
return subdomain, api_country_code.upper()
|
||||
return self.value[1], self.value[1].upper()
|
||||
|
||||
@property
|
||||
def glassdoor_domain_value(self):
|
||||
@@ -163,7 +166,7 @@ class Country(Enum):
|
||||
|
||||
|
||||
class Location(BaseModel):
|
||||
country: Country | None = None
|
||||
country: Country | str | None = None
|
||||
city: Optional[str] = None
|
||||
state: Optional[str] = None
|
||||
|
||||
@@ -173,7 +176,9 @@ class Location(BaseModel):
|
||||
location_parts.append(self.city)
|
||||
if self.state:
|
||||
location_parts.append(self.state)
|
||||
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
|
||||
if isinstance(self.country, str):
|
||||
location_parts.append(self.country)
|
||||
elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
|
||||
country_name = self.country.value[0]
|
||||
if "," in country_name:
|
||||
country_name = country_name.split(",")[0]
|
||||
@@ -217,21 +222,31 @@ class DescriptionFormat(Enum):
|
||||
|
||||
class JobPost(BaseModel):
|
||||
title: str
|
||||
company_name: str
|
||||
company_name: str | None
|
||||
job_url: str
|
||||
job_url_direct: str | None = None
|
||||
location: Optional[Location]
|
||||
|
||||
description: str | None = None
|
||||
company_url: str | None = None
|
||||
company_url_direct: str | None = None
|
||||
|
||||
job_type: list[JobType] | None = None
|
||||
compensation: Compensation | None = None
|
||||
date_posted: date | None = None
|
||||
benefits: str | None = None
|
||||
emails: list[str] | None = None
|
||||
num_urgent_words: int | None = None
|
||||
is_remote: bool | None = None
|
||||
# company_industry: str | None = None
|
||||
|
||||
# indeed specific
|
||||
company_addresses: str | None = None
|
||||
company_industry: str | None = None
|
||||
company_num_employees: str | None = None
|
||||
company_revenue: str | None = None
|
||||
company_description: str | None = None
|
||||
ceo_name: str | None = None
|
||||
ceo_photo_url: str | None = None
|
||||
logo_photo_url: str | None = None
|
||||
banner_photo_url: str | None = None
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
|
||||
@@ -11,7 +11,7 @@ import requests
|
||||
from typing import Optional
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
from ..utils import extract_emails_from_text
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import GlassdoorException
|
||||
@@ -188,7 +188,6 @@ class GlassdoorScraper(Scraper):
|
||||
is_remote=is_remote,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
|
||||
def _fetch_job_description(self, job_id):
|
||||
|
||||
@@ -4,21 +4,15 @@ jobspy.scrapers.indeed
|
||||
|
||||
This module contains routines to scrape Indeed.
|
||||
"""
|
||||
import re
|
||||
import math
|
||||
import json
|
||||
import requests
|
||||
from typing import Any
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from datetime import datetime
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
import requests
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import (
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
get_enum_from_job_type,
|
||||
markdown_converter,
|
||||
logger
|
||||
@@ -32,18 +26,19 @@ from ...jobs import (
|
||||
JobType,
|
||||
DescriptionFormat
|
||||
)
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
|
||||
|
||||
class IndeedScraper(Scraper):
|
||||
def __init__(self, proxy: str | None = None):
|
||||
"""
|
||||
Initializes IndeedScraper with the Indeed job search url
|
||||
Initializes IndeedScraper with the Indeed API url
|
||||
"""
|
||||
self.scraper_input = None
|
||||
self.jobs_per_page = 25
|
||||
self.jobs_per_page = 100
|
||||
self.num_workers = 10
|
||||
self.seen_urls = set()
|
||||
self.headers = None
|
||||
self.api_country_code = None
|
||||
self.base_url = None
|
||||
self.api_url = "https://apis.indeed.com/graphql"
|
||||
site = Site(Site.INDEED)
|
||||
@@ -56,278 +51,220 @@ class IndeedScraper(Scraper):
|
||||
:return: job_response
|
||||
"""
|
||||
self.scraper_input = scraper_input
|
||||
job_list = self._scrape_page()
|
||||
pages_processed = 1
|
||||
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
|
||||
self.base_url = f"https://{domain}.indeed.com"
|
||||
self.headers = self.api_headers.copy()
|
||||
self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value
|
||||
job_list = []
|
||||
page = 1
|
||||
|
||||
while len(self.seen_urls) < scraper_input.results_wanted:
|
||||
pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
|
||||
new_jobs = False
|
||||
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
||||
futures: list[Future] = [
|
||||
executor.submit(self._scrape_page, page + pages_processed)
|
||||
for page in range(pages_to_process)
|
||||
]
|
||||
|
||||
for future in futures:
|
||||
jobs = future.result()
|
||||
if jobs:
|
||||
job_list += jobs
|
||||
new_jobs = True
|
||||
if len(self.seen_urls) >= scraper_input.results_wanted:
|
||||
break
|
||||
|
||||
pages_processed += pages_to_process
|
||||
if not new_jobs:
|
||||
cursor = None
|
||||
offset_pages = math.ceil(self.scraper_input.offset / 100)
|
||||
for _ in range(offset_pages):
|
||||
logger.info(f'Indeed skipping search page: {page}')
|
||||
__, cursor = self._scrape_page(cursor)
|
||||
if not __:
|
||||
logger.info(f'Indeed found no jobs on page: {page}')
|
||||
break
|
||||
|
||||
if len(self.seen_urls) > scraper_input.results_wanted:
|
||||
job_list = job_list[:scraper_input.results_wanted]
|
||||
while len(self.seen_urls) < scraper_input.results_wanted:
|
||||
logger.info(f'Indeed search page: {page}')
|
||||
jobs, cursor = self._scrape_page(cursor)
|
||||
if not jobs:
|
||||
logger.info(f'Indeed found no jobs on page: {page}')
|
||||
break
|
||||
job_list += jobs
|
||||
page += 1
|
||||
return JobResponse(jobs=job_list[:scraper_input.results_wanted])
|
||||
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
def _scrape_page(self, page: int=0) -> list[JobPost]:
|
||||
def _scrape_page(self, cursor: str | None) -> (list[JobPost], str | None):
|
||||
"""
|
||||
Scrapes a page of Indeed for jobs with scraper_input criteria
|
||||
:param page:
|
||||
:return: jobs found on page, total number of jobs found for search
|
||||
:param cursor:
|
||||
:return: jobs found on page, next page cursor
|
||||
"""
|
||||
logger.info(f'Indeed search page: {page + 1}')
|
||||
job_list = []
|
||||
domain = self.scraper_input.country.indeed_domain_value
|
||||
self.base_url = f"https://{domain}.indeed.com"
|
||||
|
||||
try:
|
||||
session = create_session(self.proxy)
|
||||
response = session.get(
|
||||
f"{self.base_url}/m/jobs",
|
||||
headers=self.headers,
|
||||
params=self._add_params(page),
|
||||
)
|
||||
if response.status_code not in range(200, 400):
|
||||
if response.status_code == 429:
|
||||
logger.error(f'429 Response - Blocked by Indeed for too many requests')
|
||||
else:
|
||||
logger.error(f'Indeed response status code {response.status_code}')
|
||||
return job_list
|
||||
|
||||
except Exception as e:
|
||||
if "Proxy responded with" in str(e):
|
||||
logger.error(f'Indeed: Bad proxy')
|
||||
else:
|
||||
logger.error(f'Indeed: {str(e)}')
|
||||
return job_list
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
if "did not match any jobs" in response.text:
|
||||
return job_list
|
||||
|
||||
jobs = IndeedScraper._parse_jobs(soup)
|
||||
if not jobs:
|
||||
return []
|
||||
if (
|
||||
not jobs.get("metaData", {})
|
||||
.get("mosaicProviderJobCardsModel", {})
|
||||
.get("results")
|
||||
):
|
||||
logger.error("Indeed - No jobs found.")
|
||||
return []
|
||||
|
||||
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||
job_keys = [job['jobkey'] for job in jobs]
|
||||
jobs_detailed = self._get_job_details(job_keys)
|
||||
jobs = []
|
||||
new_cursor = None
|
||||
filters = self._build_filters()
|
||||
query = self.job_search_query.format(
|
||||
what=self.scraper_input.search_term,
|
||||
location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
|
||||
radius=self.scraper_input.distance,
|
||||
dateOnIndeed=self.scraper_input.hours_old,
|
||||
cursor=f'cursor: "{cursor}"' if cursor else '',
|
||||
filters=filters
|
||||
)
|
||||
payload = {
|
||||
'query': query,
|
||||
}
|
||||
api_headers = self.api_headers.copy()
|
||||
api_headers['indeed-co'] = self.api_country_code
|
||||
response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10)
|
||||
if response.status_code != 200:
|
||||
logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)')
|
||||
return jobs, new_cursor
|
||||
data = response.json()
|
||||
jobs = data['data']['jobSearch']['results']
|
||||
new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor']
|
||||
|
||||
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
||||
job_results: list[Future] = [
|
||||
executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
|
||||
]
|
||||
executor.submit(self._process_job, job['job']) for job in jobs
|
||||
]
|
||||
job_list = [result.result() for result in job_results if result.result()]
|
||||
return job_list, new_cursor
|
||||
|
||||
return job_list
|
||||
def _build_filters(self):
|
||||
"""
|
||||
Builds the filters dict for job type/is_remote. If hours_old is provided, composite filter for job_type/is_remote is not possible.
|
||||
IndeedApply: filters: { keyword: { field: "indeedApplyScope", keys: ["DESKTOP"] } }
|
||||
"""
|
||||
filters_str = ""
|
||||
if self.scraper_input.hours_old:
|
||||
filters_str = """
|
||||
filters: {{
|
||||
date: {{
|
||||
field: "dateOnIndeed",
|
||||
start: "{start}h"
|
||||
}}
|
||||
}}
|
||||
""".format(start=self.scraper_input.hours_old)
|
||||
elif self.scraper_input.job_type or self.scraper_input.is_remote:
|
||||
job_type_key_mapping = {
|
||||
JobType.FULL_TIME: "CF3CP",
|
||||
JobType.PART_TIME: "75GKK",
|
||||
JobType.CONTRACT: "NJXCK",
|
||||
JobType.INTERNSHIP: "VDTG7",
|
||||
}
|
||||
|
||||
def _process_job(self, job: dict, job_detailed: dict) -> JobPost | None:
|
||||
job_url = f'{self.base_url}/m/jobs/viewjob?jk={job["jobkey"]}'
|
||||
job_url_client = f'{self.base_url}/viewjob?jk={job["jobkey"]}'
|
||||
keys = []
|
||||
if self.scraper_input.job_type:
|
||||
key = job_type_key_mapping[self.scraper_input.job_type]
|
||||
keys.append(key)
|
||||
|
||||
if self.scraper_input.is_remote:
|
||||
keys.append("DSQF7")
|
||||
|
||||
if keys:
|
||||
keys_str = '", "'.join(keys) # Prepare your keys string
|
||||
filters_str = f"""
|
||||
filters: {{
|
||||
composite: {{
|
||||
filters: [{{
|
||||
keyword: {{
|
||||
field: "attributes",
|
||||
keys: ["{keys_str}"]
|
||||
}}
|
||||
}}]
|
||||
}}
|
||||
}}
|
||||
"""
|
||||
return filters_str
|
||||
|
||||
def _process_job(self, job: dict) -> JobPost | None:
|
||||
"""
|
||||
Parses the job dict into JobPost model
|
||||
:param job: dict to parse
|
||||
:return: JobPost if it's a new job
|
||||
"""
|
||||
job_url = f'{self.base_url}/viewjob?jk={job["key"]}'
|
||||
if job_url in self.seen_urls:
|
||||
return None
|
||||
return
|
||||
self.seen_urls.add(job_url)
|
||||
description = job_detailed['description']['html']
|
||||
description = job['description']['html']
|
||||
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
|
||||
job_type = self._get_job_type(job)
|
||||
timestamp_seconds = job["pubDate"] / 1000
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||
|
||||
job_type = self._get_job_type(job['attributes'])
|
||||
timestamp_seconds = job["datePublished"] / 1000
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
|
||||
employer = job['employer'].get('dossier') if job['employer'] else None
|
||||
employer_details = employer.get('employerDetails', {}) if employer else {}
|
||||
return JobPost(
|
||||
title=job["normTitle"],
|
||||
title=job["title"],
|
||||
description=description,
|
||||
company_name=job["company"],
|
||||
company_url=f"{self.base_url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed[
|
||||
company_name=job['employer'].get("name") if job.get('employer') else None,
|
||||
company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[
|
||||
'employer'] else None,
|
||||
company_url_direct=employer['links']['corporateWebsite'] if employer else None,
|
||||
|
||||
location=Location(
|
||||
city=job.get("jobLocationCity"),
|
||||
state=job.get("jobLocationState"),
|
||||
country=self.scraper_input.country,
|
||||
city=job.get("location", {}).get("city"),
|
||||
state=job.get("location", {}).get("admin1Code"),
|
||||
country=job.get("location", {}).get("countryCode"),
|
||||
),
|
||||
job_type=job_type,
|
||||
compensation=self._get_compensation(job, job_detailed),
|
||||
compensation=self._get_compensation(job),
|
||||
date_posted=date_posted,
|
||||
job_url=job_url_client,
|
||||
job_url=job_url,
|
||||
job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
is_remote=self._is_job_remote(job, job_detailed, description)
|
||||
is_remote=self._is_job_remote(job, description),
|
||||
|
||||
company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None,
|
||||
company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None,
|
||||
company_num_employees=employer_details.get('employeesLocalizedLabel'),
|
||||
company_revenue=employer_details.get('revenueLocalizedLabel'),
|
||||
company_description=employer_details.get('briefDescription'),
|
||||
ceo_name=employer_details.get('ceoName'),
|
||||
ceo_photo_url=employer_details.get('ceoPhotoUrl'),
|
||||
|
||||
logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None,
|
||||
banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None,
|
||||
)
|
||||
|
||||
def _get_job_details(self, job_keys: list[str]) -> dict:
    """
    Queries the GraphQL endpoint for detailed job information for the given job keys.
    :param job_keys: job keys to look up; each key is wrapped in double quotes and
        substituted into the query template as a GraphQL list literal
    :return: the ``data.jobData.results`` entry of the JSON response when the endpoint
        answers with status 200, otherwise an empty dict
    :raises requests.RequestException: on connection failure, or when the endpoint
        does not respond within the timeout
    """
    job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']'
    # Work on a shallow copy so the payload template stored on the scraper is not mutated.
    payload = dict(self.api_payload)
    payload["query"] = self.api_payload["query"].format(job_keys_gql=job_keys_gql)
    # Bound the request: without a timeout a stalled connection would block forever.
    response = requests.post(
        self.api_url,
        headers=self.api_headers,
        json=payload,
        proxies=self.proxy,
        timeout=10,
    )
    if response.status_code != 200:
        return {}
    return response.json()['data']['jobData']['results']
|
||||
|
||||
def _add_params(self, page: int) -> dict[str, str | Any]:
|
||||
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
|
||||
params = {
|
||||
"q": self.scraper_input.search_term,
|
||||
"l": self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
|
||||
"filter": 0,
|
||||
"start": self.scraper_input.offset + page * 10,
|
||||
"sort": "date",
|
||||
"fromage": fromage,
|
||||
}
|
||||
if self.scraper_input.distance:
|
||||
params["radius"] = self.scraper_input.distance
|
||||
|
||||
sc_values = []
|
||||
if self.scraper_input.is_remote:
|
||||
sc_values.append("attr(DSQF7)")
|
||||
if self.scraper_input.job_type:
|
||||
sc_values.append("jt({})".format(self.scraper_input.job_type.value[0]))
|
||||
|
||||
if sc_values:
|
||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||
|
||||
if self.scraper_input.easy_apply:
|
||||
params['iafilter'] = 1
|
||||
|
||||
return params
|
||||
|
||||
@staticmethod
|
||||
def _get_job_type(job: dict) -> list[JobType] | None:
|
||||
def _get_job_type(attributes: list) -> list[JobType]:
|
||||
"""
|
||||
Parses the job to get list of job types
|
||||
:param job:
|
||||
:return:
|
||||
Parses the attributes to get list of job types
|
||||
:param attributes:
|
||||
:return: list of JobType
|
||||
"""
|
||||
job_types: list[JobType] = []
|
||||
for taxonomy in job["taxonomyAttributes"]:
|
||||
if taxonomy["label"] == "job-types":
|
||||
for i in range(len(taxonomy["attributes"])):
|
||||
label = taxonomy["attributes"][i].get("label")
|
||||
if label:
|
||||
job_type_str = label.replace("-", "").replace(" ", "").lower()
|
||||
job_type = get_enum_from_job_type(job_type_str)
|
||||
if job_type:
|
||||
job_types.append(job_type)
|
||||
for attribute in attributes:
|
||||
job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower()
|
||||
job_type = get_enum_from_job_type(job_type_str)
|
||||
if job_type:
|
||||
job_types.append(job_type)
|
||||
return job_types
|
||||
|
||||
@staticmethod
|
||||
def _get_compensation(job: dict, job_detailed: dict) -> Compensation:
|
||||
def _get_compensation(job: dict) -> Compensation | None:
|
||||
"""
|
||||
Parses the job to get
|
||||
Parses the job to get compensation
|
||||
:param job:
|
||||
:param job:
|
||||
:param job_detailed:
|
||||
:return: compensation object
|
||||
"""
|
||||
comp = job_detailed['compensation']['baseSalary']
|
||||
comp = job['compensation']['baseSalary']
|
||||
if comp:
|
||||
interval = IndeedScraper._get_correct_interval(comp['unitOfWork'])
|
||||
interval = IndeedScraper._get_compensation_interval(comp['unitOfWork'])
|
||||
if interval:
|
||||
return Compensation(
|
||||
interval=interval,
|
||||
min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None,
|
||||
max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None,
|
||||
currency=job_detailed['compensation']['currencyCode']
|
||||
currency=job['compensation']['currencyCode']
|
||||
)
|
||||
|
||||
extracted_salary = job.get("extractedSalary")
|
||||
compensation = None
|
||||
if extracted_salary:
|
||||
salary_snippet = job.get("salarySnippet")
|
||||
currency = salary_snippet.get("currency") if salary_snippet else None
|
||||
interval = (extracted_salary.get("type"),)
|
||||
if isinstance(interval, tuple):
|
||||
interval = interval[0]
|
||||
|
||||
interval = interval.upper()
|
||||
if interval in CompensationInterval.__members__:
|
||||
compensation = Compensation(
|
||||
interval=CompensationInterval[interval],
|
||||
min_amount=int(extracted_salary.get("min")),
|
||||
max_amount=int(extracted_salary.get("max")),
|
||||
currency=currency,
|
||||
)
|
||||
return compensation
|
||||
|
||||
@staticmethod
|
||||
def _parse_jobs(soup: BeautifulSoup) -> dict:
|
||||
def _is_job_remote(job: dict, description: str) -> bool:
|
||||
"""
|
||||
Parses the jobs from the soup object
|
||||
:param soup:
|
||||
:return: jobs
|
||||
Searches the description, location, and attributes to check if job is remote
|
||||
"""
|
||||
def find_mosaic_script() -> Tag | None:
|
||||
script_tags = soup.find_all("script")
|
||||
|
||||
for tag in script_tags:
|
||||
if (
|
||||
tag.string
|
||||
and "mosaic.providerData" in tag.string
|
||||
and "mosaic-provider-jobcards" in tag.string
|
||||
):
|
||||
return tag
|
||||
return None
|
||||
|
||||
script_tag = find_mosaic_script()
|
||||
if script_tag:
|
||||
script_str = script_tag.string
|
||||
pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
|
||||
p = re.compile(pattern, re.DOTALL)
|
||||
m = p.search(script_str)
|
||||
if m:
|
||||
jobs = json.loads(m.group(1).strip())
|
||||
return jobs
|
||||
else:
|
||||
logger.warning(f'Indeed: Could not find mosaic provider job cards data')
|
||||
return {}
|
||||
else:
|
||||
logger.warning(f"Indeed: Could not parse any jobs on the page")
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
|
||||
remote_keywords = ['remote', 'work from home', 'wfh']
|
||||
is_remote_in_attributes = any(
|
||||
any(keyword in attr['label'].lower() for keyword in remote_keywords)
|
||||
for attr in job_detailed['attributes']
|
||||
for attr in job['attributes']
|
||||
)
|
||||
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
|
||||
is_remote_in_location = any(
|
||||
keyword in job_detailed['location']['formatted']['long'].lower()
|
||||
keyword in job['location']['formatted']['long'].lower()
|
||||
for keyword in remote_keywords
|
||||
)
|
||||
is_remote_in_taxonomy = any(
|
||||
taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
|
||||
for taxonomy in job.get("taxonomyAttributes", [])
|
||||
)
|
||||
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
|
||||
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
||||
|
||||
@staticmethod
|
||||
def _get_correct_interval(interval: str) -> CompensationInterval:
|
||||
def _get_compensation_interval(interval: str) -> CompensationInterval:
|
||||
interval_mapping = {
|
||||
"DAY": "DAILY",
|
||||
"YEAR": "YEARLY",
|
||||
@@ -341,16 +278,6 @@ class IndeedScraper(Scraper):
|
||||
else:
|
||||
raise ValueError(f"Unsupported interval: {interval}")
|
||||
|
||||
headers = {
|
||||
'Host': 'www.indeed.com',
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'sec-fetch-site': 'same-origin',
|
||||
'sec-fetch-dest': 'document',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'sec-fetch-mode': 'navigate',
|
||||
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
|
||||
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
|
||||
}
|
||||
api_headers = {
|
||||
'Host': 'apis.indeed.com',
|
||||
'content-type': 'application/json',
|
||||
@@ -360,24 +287,35 @@ class IndeedScraper(Scraper):
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
|
||||
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
|
||||
'indeed-co': 'US',
|
||||
}
|
||||
api_payload = {
|
||||
"query": """
|
||||
job_search_query = """
|
||||
query GetJobData {{
|
||||
jobData(input: {{
|
||||
jobKeys: {job_keys_gql}
|
||||
}}) {{
|
||||
jobSearch(
|
||||
what: "{what}"
|
||||
location: {{ where: "{location}", radius: {radius}, radiusUnit: MILES }}
|
||||
includeSponsoredResults: NONE
|
||||
limit: 100
|
||||
sort: DATE
|
||||
{cursor}
|
||||
{filters}
|
||||
) {{
|
||||
pageInfo {{
|
||||
nextCursor
|
||||
}}
|
||||
results {{
|
||||
trackingKey
|
||||
job {{
|
||||
key
|
||||
title
|
||||
datePublished
|
||||
dateOnIndeed
|
||||
description {{
|
||||
html
|
||||
}}
|
||||
location {{
|
||||
countryName
|
||||
countryCode
|
||||
admin1Code
|
||||
city
|
||||
postalCode
|
||||
streetAddress
|
||||
@@ -399,10 +337,30 @@ class IndeedScraper(Scraper):
|
||||
currencyCode
|
||||
}}
|
||||
attributes {{
|
||||
key
|
||||
label
|
||||
}}
|
||||
employer {{
|
||||
relativeCompanyPageUrl
|
||||
name
|
||||
dossier {{
|
||||
employerDetails {{
|
||||
addresses
|
||||
industry
|
||||
employeesLocalizedLabel
|
||||
revenueLocalizedLabel
|
||||
briefDescription
|
||||
ceoName
|
||||
ceoPhotoUrl
|
||||
}}
|
||||
images {{
|
||||
headerImageUrl
|
||||
squareLogoUrl
|
||||
}}
|
||||
links {{
|
||||
corporateWebsite
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
recruit {{
|
||||
viewJobUrl
|
||||
@@ -414,4 +372,3 @@ class IndeedScraper(Scraper):
|
||||
}}
|
||||
}}
|
||||
"""
|
||||
}
|
||||
|
||||
@@ -28,7 +28,6 @@ from ...jobs import (
|
||||
)
|
||||
from ..utils import (
|
||||
logger,
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
currency_parser,
|
||||
@@ -187,7 +186,6 @@ class LinkedInScraper(Scraper):
|
||||
except:
|
||||
date_posted = None
|
||||
benefits_tag = job_card.find("span", class_="result-benefits__text")
|
||||
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
|
||||
if full_descr:
|
||||
description, job_type = self._get_job_description(job_url)
|
||||
|
||||
@@ -199,11 +197,9 @@ class LinkedInScraper(Scraper):
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
compensation=compensation,
|
||||
benefits=benefits,
|
||||
job_type=job_type,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
|
||||
def _get_job_description(
|
||||
|
||||
@@ -19,20 +19,6 @@ if not logger.handlers:
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
|
||||
def count_urgent_words(description: str) -> int:
    """
    Count the number of urgent words or phrases in a job description.

    Matching is case-insensitive and counts non-overlapping occurrences of
    "urgent"/"urgency", "immediate"/"immediately", "start asap" and
    "hiring now"/"hiring immediate(ly)".

    :param description: job description text; ``None`` or an empty string counts as zero
    :return: number of urgent words or phrases found
    """
    # Guard against missing descriptions instead of letting the regex raise TypeError.
    if not description:
        return 0

    urgent_patterns = re.compile(
        r"\burgen(t|cy)|\bimmediate(ly)?\b|start asap|\bhiring (now|immediate(ly)?)\b",
        re.IGNORECASE,
    )
    # findall yields one entry per non-overlapping match, so its length is the count.
    return len(urgent_patterns.findall(description))
|
||||
|
||||
|
||||
def markdown_converter(description_html: str):
|
||||
if description_html is None:
|
||||
return None
|
||||
|
||||
@@ -14,7 +14,6 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import (
|
||||
logger,
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
markdown_converter
|
||||
@@ -161,7 +160,6 @@ class ZipRecruiterScraper(Scraper):
|
||||
job_url=job_url,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
|
||||
def _get_cookies(self):
|
||||
|
||||
Reference in New Issue
Block a user