enh(indeed): company url

pull/104/head
Cullen Watson 2024-02-09 09:47:18 -06:00
parent 32282305c8
commit e6270f8eec
3 changed files with 143 additions and 44 deletions

View File

@ -37,7 +37,6 @@ jobs = scrape_jobs(
location="Dallas, TX", location="Dallas, TX",
results_wanted=10, results_wanted=10,
country_indeed='USA' # only needed for indeed / glassdoor country_indeed='USA' # only needed for indeed / glassdoor
# full_description=True (get full description for LinkedIn/Indeed; slower)
) )
print(f"Found {len(jobs)} jobs") print(f"Found {len(jobs)} jobs")
print(jobs.head()) print(jobs.head())
@ -68,7 +67,7 @@ Optional
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
├── is_remote (bool) ├── is_remote (bool)
├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower) ├── linkedin_full_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site ├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids ├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids

View File

@ -193,13 +193,20 @@ class CompensationInterval(Enum):
@classmethod @classmethod
def get_interval(cls, pay_period): def get_interval(cls, pay_period):
return cls[pay_period].value if pay_period in cls.__members__ else None interval_mapping = {
"YEAR": cls.YEARLY,
"HOUR": cls.HOURLY,
}
if pay_period in interval_mapping:
return interval_mapping[pay_period].value
else:
return cls[pay_period].value if pay_period in cls.__members__ else None
class Compensation(BaseModel): class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None interval: Optional[CompensationInterval] = None
min_amount: int | None = None min_amount: float | None = None
max_amount: int | None = None max_amount: float | None = None
currency: Optional[str] = "USD" currency: Optional[str] = "USD"

View File

@ -6,8 +6,8 @@ This module contains routines to scrape Indeed.
""" """
import re import re
import math import math
import io
import json import json
import requests
from typing import Any from typing import Any
from datetime import datetime from datetime import datetime
@ -95,28 +95,26 @@ class IndeedScraper(Scraper):
): ):
raise IndeedException("No jobs found.") raise IndeedException("No jobs found.")
def process_job(job: dict) -> JobPost | None: def process_job(job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}' job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
extracted_salary = job.get("extractedSalary")
compensation = None compensation = None
if extracted_salary: comp = job_detailed['compensation']['baseSalary']
salary_snippet = job.get("salarySnippet") if comp:
currency = salary_snippet.get("currency") if salary_snippet else None interval = CompensationInterval.get_interval(comp['unitOfWork'])
interval = (extracted_salary.get("type"),) interval = self.get_correct_interval(comp['unitOfWork'])
if isinstance(interval, tuple): if interval:
interval = interval[0]
interval = interval.upper()
if interval in CompensationInterval.__members__:
compensation = Compensation( compensation = Compensation(
interval=CompensationInterval[interval], interval=interval,
min_amount=int(extracted_salary.get("min")), min_amount=round(comp['range'].get('min', 0), 2) if comp['range'].get(
max_amount=int(extracted_salary.get("max")), 'min') is not None else None,
currency=currency, max_amount = round(comp['range'].get('max', 0), 2) if comp['range'].get('max') is not None else None,
currency=job_detailed['compensation']['currencyCode']
) )
job_type = IndeedScraper.get_job_type(job) job_type = IndeedScraper.get_job_type(job)
@ -124,19 +122,11 @@ class IndeedScraper(Scraper):
date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d") date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url) if scraper_input.full_description else None
with io.StringIO(job["snippet"]) as f:
soup_io = BeautifulSoup(f, "html.parser")
li_elements = soup_io.find_all("li")
if description is None and li_elements:
description = " ".join(li.text for li in li_elements)
job_post = JobPost( job_post = JobPost(
title=job["normTitle"], title=job["normTitle"],
description=description, description=description,
company_name=job["company"], company_name=job["company"],
company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None, company_url=f"{self.url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed['employer'] else None,
location=Location( location=Location(
city=job.get("jobLocationCity"), city=job.get("jobLocationCity"),
state=job.get("jobLocationState"), state=job.get("jobLocationState"),
@ -150,15 +140,19 @@ class IndeedScraper(Scraper):
num_urgent_words=count_urgent_words(description) num_urgent_words=count_urgent_words(description)
if description if description
else None, else None,
is_remote=self.is_remote_job(job), is_remote=IndeedScraper.is_job_remote(job, job_detailed, description)
) )
return job_post return job_post
workers = 10 if scraper_input.full_description else 10 # possibly lessen 10 when fetching desc based on feedback workers = 10
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self.get_job_details(job_keys)
with ThreadPoolExecutor(max_workers=workers) as executor: with ThreadPoolExecutor(max_workers=workers) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(process_job, job) for job in jobs executor.submit(process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
@ -333,17 +327,6 @@ class IndeedScraper(Scraper):
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3', 'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
} }
@staticmethod
def is_remote_job(job: dict) -> bool:
"""
:param job:
:return: bool
"""
for taxonomy in job.get("taxonomyAttributes", []):
if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
return True
return False
@staticmethod @staticmethod
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]: def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
params = { params = {
@ -369,3 +352,113 @@ class IndeedScraper(Scraper):
params['iafilter'] = 1 params['iafilter'] = 1
return params return params
@staticmethod
def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
remote_keywords = ['remote', 'work from home', 'wfh']
is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords)
for attr in job_detailed['attributes']
)
is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
is_remote_in_location = any(
keyword in job_detailed['location']['formatted']['long'].lower()
for keyword in remote_keywords
)
is_remote_in_taxonomy = any(
taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
for taxonomy in job.get("taxonomyAttributes", [])
)
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
@staticmethod
def get_job_details(job_keys: list[str]) -> dict:
"""
Queries the GraphQL endpoint for detailed job information for the given job keys.
"""
url = "https://apis.indeed.com/graphql"
headers = {
'Host': 'apis.indeed.com',
'content-type': 'application/json',
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
'accept': 'application/json',
'indeed-locale': 'en-US',
'accept-language': 'en-US,en;q=0.9',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
'indeed-co': 'US',
}
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']'
payload = {
"query": f"""
query GetJobData {{
jobData(input: {{
jobKeys: {job_keys_gql}
}}) {{
results {{
job {{
key
title
description {{
html
}}
location {{
countryName
countryCode
city
postalCode
streetAddress
formatted {{
short
long
}}
}}
compensation {{
baseSalary {{
unitOfWork
range {{
... on Range {{
min
max
}}
}}
}}
currencyCode
}}
attributes {{
label
}}
employer {{
relativeCompanyPageUrl
}}
recruit {{
viewJobUrl
detailedSalary
workSchedule
}}
}}
}}
}}
}}
"""
}
response = requests.post(url, headers=headers, json=payload)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
@staticmethod
def get_correct_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"MONTH": "MONTHLY"
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")