mirror of https://github.com/Bunsly/JobSpy
added support for indeed with multi locations, fixed parsing class for mongo
parent cb625f325f
commit f0ea89b357

@@ -290,10 +290,14 @@ class JobPost(BaseModel):
     job_function: str | None = None
 
     def model_dump(self, exclude: set = None):
-        # Use `Location`'s custom serialization logic
         data = super().model_dump(exclude=exclude)
+        # Use `Location`'s custom serialization logic
         if self.location:
-            data['location'] = self.location.model_dump()
+            data['location'] = self.location.display_location()
+
+        # Serialize `job_type` as a list of strings
+        if self.job_type:
+            data['job_type'] = [jt.value for jt in self.job_type]
         return data
 
     @staticmethod
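
For context, a minimal sketch of how the reworked model_dump lends itself to a Mongo write; only model_dump, display_location, and the job_type list come from the hunk above, while pymongo and the database/collection names are assumptions.

    from pymongo import MongoClient  # assumption: pymongo is the Mongo client in use

    client = MongoClient("mongodb://localhost:27017")
    jobs_collection = client["jobspy"]["jobs"]  # hypothetical database/collection names

    def store_job(job_post: "JobPost") -> None:
        # location is now the display_location() string and job_type a list of
        # plain enum values, so the dict is BSON-serializable without extra encoders.
        jobs_collection.insert_one(job_post.model_dump())
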
@@ -10,7 +10,7 @@ async def main():
 
     jobs = scrape_jobs(
         # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
-        site_name=["linkedin"],
+        site_name=["indeed"],
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",
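
Given the multi-location support this commit adds on the scraper side, the example might eventually be driven with several locations; whether scrape_jobs itself accepts a list-valued parameter is not shown in this hunk, so the `locations` keyword below is purely an assumption for illustration.

    jobs = scrape_jobs(
        site_name=["indeed"],
        search_term="software engineer",
        locations=["Central, Israel", "Tel Aviv, Israel"],  # assumed parameter name
        results_wanted=30,
    )
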
@@ -68,26 +68,29 @@ class IndeedScraper(Scraper):
         job_list = []
         page = 1
 
-        cursor = None
-        while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
-                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
-            )
-            jobs, cursor = self._scrape_page(cursor)
-            if not jobs:
-                logger.info(f"found no jobs on page: {page}")
-                break
-            job_list += jobs
-            page += 1
+        for location in self.scraper_input.locations:
+            cursor = None
+            logger.info(f"start searching for location: {location}")
+            while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
+                logger.info(
+                    f"search page: {
+                        page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
+                )
+                jobs, cursor = self._scrape_page(cursor, location)
+                if not jobs:
+                    logger.info(f"found no jobs on page: {page}")
+                    break
+                job_list += jobs
+                page += 1
 
         return JobResponse(
             jobs=job_list[
-                scraper_input.offset : scraper_input.offset
+                scraper_input.offset: scraper_input.offset
                 + scraper_input.results_wanted
             ]
         )
 
-    def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
+    def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]:
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param cursor:
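
A trimmed-down sketch of the new control flow, assuming only what the hunk shows (a `locations` list plus `results_wanted`/`offset` on the input, and a scraper with `seen_urls` and `_scrape_page`). Note that `seen_urls` is shared across locations, so the `results_wanted` cap applies to the run as a whole rather than per location.

    def collect_jobs(scraper, scraper_input) -> list:
        job_list = []
        for location in scraper_input.locations:
            cursor = None  # Indeed's pagination cursor restarts for each location
            while len(scraper.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
                jobs, cursor = scraper._scrape_page(cursor, location)
                if not jobs:
                    break
                job_list += jobs
        return job_list
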
@@ -104,8 +107,9 @@ class IndeedScraper(Scraper):
         query = job_search_query.format(
             what=(f'what: "{search_term}"' if search_term else ""),
             location=(
-                f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
-                if self.scraper_input.location
+                f'location: {{where: "{location}", radius: {
+                    self.scraper_input.distance}, radiusUnit: MILES}}'
+                if location
                 else ""
             ),
             dateOnIndeed=self.scraper_input.hours_old,
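
To see what the per-location fragment ends up looking like, a standalone rendering of the template above; the distance value of 50 is an assumption, only the template itself comes from the diff.

    location = "Central, Israel"
    distance = 50  # stand-in for scraper_input.distance
    fragment = f'location: {{where: "{location}", radius: {distance}, radiusUnit: MILES}}'
    print(fragment)
    # location: {where: "Central, Israel", radius: 50, radiusUnit: MILES}
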
@@ -125,7 +129,8 @@ class IndeedScraper(Scraper):
         )
         if not response.ok:
             logger.info(
-                f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
+                f"responded with status code: {
+                    response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor
         data = response.json()
@@ -214,16 +219,20 @@ class IndeedScraper(Scraper):
 
         job_type = self._get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
-        date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
+        date_posted = datetime.fromtimestamp(
+            timestamp_seconds).strftime("%Y-%m-%d")
         employer = job["employer"].get("dossier") if job["employer"] else None
-        employer_details = employer.get("employerDetails", {}) if employer else {}
+        employer_details = employer.get(
+            "employerDetails", {}) if employer else {}
         rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
         return JobPost(
             id=f'in-{job["key"]}',
             title=job["title"],
             description=description,
-            company_name=job["employer"].get("name") if job.get("employer") else None,
-            company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
+            company_name=job["employer"].get(
+                "name") if job.get("employer") else None,
+            company_url=(f"{self.base_url}{
+                rel_url}" if job["employer"] else None),
             company_url_direct=(
                 employer["links"]["corporateWebsite"] if employer else None
             ),
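
As a quick check of the datePublished handling above (the field arrives in epoch milliseconds, hence the division by 1000), a standalone conversion with an arbitrary sample value:

    from datetime import datetime

    date_published_ms = 1_700_000_000_000  # arbitrary sample, not real Indeed data
    timestamp_seconds = date_published_ms / 1000
    print(datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d"))  # 2023-11-14 in most timezones
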
@@ -235,11 +244,14 @@ class IndeedScraper(Scraper):
             job_type=job_type,
             compensation=self._get_compensation(job["compensation"]),
             date_posted=date_posted,
+            datetime_posted=date_posted,
             job_url=job_url,
             job_url_direct=(
-                job["recruit"].get("viewJobUrl") if job.get("recruit") else None
+                job["recruit"].get("viewJobUrl") if job.get(
+                    "recruit") else None
             ),
-            emails=extract_emails_from_text(description) if description else None,
+            emails=extract_emails_from_text(
+                description) if description else None,
             is_remote=self._is_job_remote(job, description),
             company_addresses=(
                 employer_details["addresses"][0]
@@ -255,7 +267,8 @@ class IndeedScraper(Scraper):
                 if employer_details.get("industry")
                 else None
             ),
-            company_num_employees=employer_details.get("employeesLocalizedLabel"),
+            company_num_employees=employer_details.get(
+                "employeesLocalizedLabel"),
             company_revenue=employer_details.get("revenueLocalizedLabel"),
             company_description=employer_details.get("briefDescription"),
             company_logo=(
|
@ -274,7 +287,8 @@ class IndeedScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
job_types: list[JobType] = []
|
job_types: list[JobType] = []
|
||||||
for attribute in attributes:
|
for attribute in attributes:
|
||||||
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
|
job_type_str = attribute["label"].replace(
|
||||||
|
"-", "").replace(" ", "").lower()
|
||||||
job_type = get_enum_from_job_type(job_type_str)
|
job_type = get_enum_from_job_type(job_type_str)
|
||||||
if job_type:
|
if job_type:
|
||||||
job_types.append(job_type)
|
job_types.append(job_type)
|
||||||
|
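
A toy run of the label normalization above; the labels are made-up examples rather than real Indeed attribute payloads, and the resulting strings are what get_enum_from_job_type is asked to match.

    for label in ("Full-time", "Part-time", "Temp-to-hire"):
        print(label.replace("-", "").replace(" ", "").lower())
    # fulltime
    # parttime
    # temptohire
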
@@ -319,7 +333,8 @@ class IndeedScraper(Scraper):
         """
         remote_keywords = ["remote", "work from home", "wfh"]
         is_remote_in_attributes = any(
-            any(keyword in attr["label"].lower() for keyword in remote_keywords)
+            any(keyword in attr["label"].lower()
+                for keyword in remote_keywords)
             for attr in job["attributes"]
         )
         is_remote_in_description = any(
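
And a toy run of the attribute-based remote check above, with made-up attribute labels:

    remote_keywords = ["remote", "work from home", "wfh"]
    attributes = [{"label": "Full-time"}, {"label": "Remote"}]  # hypothetical payload
    is_remote_in_attributes = any(
        any(keyword in attr["label"].lower() for keyword in remote_keywords)
        for attr in attributes
    )
    print(is_remote_in_attributes)  # True
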