added support for Indeed with multiple locations; fixed parsing class for Mongo

pull/231/head
Yariv Menachem 2024-12-12 13:37:39 +02:00
parent cb625f325f
commit f0ea89b357
3 changed files with 47 additions and 28 deletions


@@ -290,10 +290,14 @@ class JobPost(BaseModel):
     job_function: str | None = None

     def model_dump(self, exclude: set = None):
-        # Use `Location`'s custom serialization logic
         data = super().model_dump(exclude=exclude)
+        # Use `Location`'s custom serialization logic
         if self.location:
-            data['location'] = self.location.model_dump()
+            data['location'] = self.location.display_location()
+        # Serialize `job_type` as a list of strings
+        if self.job_type:
+            data['job_type'] = [jt.value for jt in self.job_type]
         return data

     @staticmethod
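The net effect is that `model_dump()` now emits Mongo-friendly primitives instead of nested models. A minimal sketch of the intended round-trip, assuming `display_location()` renders the location as a single human-readable string and `collection` is a pymongo collection handle (both assumptions; neither appears in this hunk):

post = JobPost(
    id="in-abc123",  # illustrative values only
    title="Software Engineer",
    company_name="Acme",
    job_url="https://il.indeed.com/viewjob?jk=abc123",
    location=Location(city="Tel Aviv", country=Country.ISRAEL),
    job_type=[JobType.FULL_TIME],
)

doc = post.model_dump()
# doc["location"] is now a display string rather than a nested dict, and
# doc["job_type"] a list of plain enum values, so the document can be
# inserted into MongoDB without custom BSON encoders:
collection.insert_one(doc)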


@@ -10,7 +10,7 @@ async def main():
     jobs = scrape_jobs(
         # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
-        site_name=["linkedin"],
+        site_name=["indeed"],
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",


@@ -68,26 +68,29 @@ class IndeedScraper(Scraper):
         job_list = []
         page = 1
-        cursor = None
-        while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
-                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
-            )
-            jobs, cursor = self._scrape_page(cursor)
-            if not jobs:
-                logger.info(f"found no jobs on page: {page}")
-                break
-            job_list += jobs
-            page += 1
+        for location in self.scraper_input.locations:
+            cursor = None
+            logger.info(f"start searching for location: {location}")
+            while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
+                logger.info(
+                    f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
+                )
+                jobs, cursor = self._scrape_page(cursor, location)
+                if not jobs:
+                    logger.info(f"found no jobs on page: {page}")
+                    break
+                job_list += jobs
+                page += 1
         return JobResponse(
             jobs=job_list[
-                scraper_input.offset : scraper_input.offset
+                scraper_input.offset: scraper_input.offset
                 + scraper_input.results_wanted
             ]
         )

-    def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
+    def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]:
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param cursor:
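Stripped of scraper detail, the new control flow looks like the sketch below: the pagination cursor resets per location, while the `seen_urls` set caps total results across all locations (`fetch_page` is a stand-in for `_scrape_page`; this is a simplified model, not the class itself):

def run_all(locations: list[str], results_wanted: int, offset: int):
    job_list, seen_urls = [], set()
    for location in locations:
        cursor = None  # cursor is per-location; each search restarts it
        while len(seen_urls) < results_wanted + offset:
            jobs, cursor = fetch_page(cursor, location)
            if not jobs:  # this location is exhausted, move on
                break
            seen_urls.update(job.job_url for job in jobs)
            job_list += jobs
    # offset/limit are applied once, across all locations combined
    return job_list[offset : offset + results_wanted]

One subtlety in the diff: `cursor` resets per location but `page` does not; since `page` only feeds the log message, the carried-over count is cosmetic.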
@@ -104,8 +107,9 @@ class IndeedScraper(Scraper):
         query = job_search_query.format(
             what=(f'what: "{search_term}"' if search_term else ""),
             location=(
-                f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
-                if self.scraper_input.location
+                f'location: {{where: "{location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
+                if location
                 else ""
             ),
             dateOnIndeed=self.scraper_input.hours_old,
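Rendered with sample values, the clause above yields a GraphQL location filter like the following (the doubled braces in the f-string escape to literal braces; values are illustrative):

location = "Central, Israel"
distance = 25  # illustrative; the real value comes from scraper_input.distance
clause = (
    f'location: {{where: "{location}", radius: {distance}, radiusUnit: MILES}}'
    if location
    else ""
)
print(clause)
# location: {where: "Central, Israel", radius: 25, radiusUnit: MILES}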
@@ -125,7 +129,8 @@ class IndeedScraper(Scraper):
         )
         if not response.ok:
             logger.info(
                 f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor
         data = response.json()
@@ -214,16 +219,20 @@ class IndeedScraper(Scraper):
         job_type = self._get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
-        date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
+        date_posted = datetime.fromtimestamp(
+            timestamp_seconds).strftime("%Y-%m-%d")
         employer = job["employer"].get("dossier") if job["employer"] else None
-        employer_details = employer.get("employerDetails", {}) if employer else {}
+        employer_details = employer.get(
+            "employerDetails", {}) if employer else {}
         rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
         return JobPost(
             id=f'in-{job["key"]}',
             title=job["title"],
             description=description,
-            company_name=job["employer"].get("name") if job.get("employer") else None,
+            company_name=job["employer"].get(
+                "name") if job.get("employer") else None,
             company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
             company_url_direct=(
                 employer["links"]["corporateWebsite"] if employer else None
             ),
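Two details in this block are easy to miss: Indeed's `datePublished` is epoch milliseconds (hence the division by 1000), and every hop through `employer` is guarded because the dossier may be absent. A self-contained sketch of both patterns, using a fabricated `job` dict:

from datetime import datetime

job = {"datePublished": 1734000000000, "employer": None}  # fabricated sample

# epoch milliseconds -> "YYYY-MM-DD"
date_posted = datetime.fromtimestamp(job["datePublished"] / 1000).strftime("%Y-%m-%d")

# guard each level: employer, dossier, and details may all be missing
employer = job["employer"].get("dossier") if job["employer"] else None
employer_details = employer.get("employerDetails", {}) if employer else {}
print(date_posted, employer_details)  # e.g. 2024-12-12 {}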
@@ -235,11 +244,14 @@ class IndeedScraper(Scraper):
             job_type=job_type,
             compensation=self._get_compensation(job["compensation"]),
             date_posted=date_posted,
+            datetime_posted=date_posted,
             job_url=job_url,
             job_url_direct=(
-                job["recruit"].get("viewJobUrl") if job.get("recruit") else None
+                job["recruit"].get("viewJobUrl") if job.get(
+                    "recruit") else None
             ),
-            emails=extract_emails_from_text(description) if description else None,
+            emails=extract_emails_from_text(
+                description) if description else None,
             is_remote=self._is_job_remote(job, description),
             company_addresses=(
                 employer_details["addresses"][0]
@@ -255,7 +267,8 @@ class IndeedScraper(Scraper):
                 if employer_details.get("industry")
                 else None
             ),
-            company_num_employees=employer_details.get("employeesLocalizedLabel"),
+            company_num_employees=employer_details.get(
+                "employeesLocalizedLabel"),
             company_revenue=employer_details.get("revenueLocalizedLabel"),
             company_description=employer_details.get("briefDescription"),
             company_logo=(
@@ -274,7 +287,8 @@ class IndeedScraper(Scraper):
         """
         job_types: list[JobType] = []
         for attribute in attributes:
-            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
+            job_type_str = attribute["label"].replace(
+                "-", "").replace(" ", "").lower()
             job_type = get_enum_from_job_type(job_type_str)
             if job_type:
                 job_types.append(job_type)
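The normalization collapses Indeed's display labels onto the library's enum keys, e.g.:

for label in ("Full-time", "Part-time", "Contract"):  # illustrative labels
    print(label.replace("-", "").replace(" ", "").lower())
# fulltime
# parttime
# contract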
@@ -319,7 +333,8 @@ class IndeedScraper(Scraper):
         """
         remote_keywords = ["remote", "work from home", "wfh"]
         is_remote_in_attributes = any(
-            any(keyword in attr["label"].lower() for keyword in remote_keywords)
+            any(keyword in attr["label"].lower()
+                for keyword in remote_keywords)
             for attr in job["attributes"]
         )
         is_remote_in_description = any(
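For completeness, the attribute check reduces to a nested `any` over labels and keywords; a standalone sketch with fabricated attributes:

remote_keywords = ["remote", "work from home", "wfh"]
attributes = [{"label": "Remote"}, {"label": "Full-time"}]  # fabricated

is_remote_in_attributes = any(
    any(keyword in attr["label"].lower() for keyword in remote_keywords)
    for attr in attributes
)
print(is_remote_in_attributes)  # True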