added support for indeed with multi locations, fixed parsing class for mongo

pull/231/head
Yariv Menachem 2024-12-12 13:37:39 +02:00
parent cb625f325f
commit f0ea89b357
3 changed files with 47 additions and 28 deletions

View File

@@ -290,10 +290,14 @@ class JobPost(BaseModel):
     job_function: str | None = None

     def model_dump(self, exclude: set = None):
-        # Use `Location`'s custom serialization logic
         data = super().model_dump(exclude=exclude)
+        # Use `Location`'s custom serialization logic
         if self.location:
-            data['location'] = self.location.model_dump()
+            data['location'] = self.location.display_location()
+        # Serialize `job_type` as a list of strings
+        if self.job_type:
+            data['job_type'] = [jt.value for jt in self.job_type]
         return data

     @staticmethod
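
With this override, location is serialized to a display string instead of a nested document, and job_type to a list of plain strings, both of which store cleanly in Mongo. A minimal sketch of the resulting behavior, using simplified stand-ins (not the library's actual classes) for Location, JobType, and JobPost:

from enum import Enum
from pydantic import BaseModel

class JobType(Enum):
    FULL_TIME = "fulltime"

class Location(BaseModel):
    city: str | None = None
    country: str | None = None

    def display_location(self) -> str:
        # Join the non-empty parts into a single display string
        return ", ".join(p for p in (self.city, self.country) if p)

class JobPost(BaseModel):
    title: str
    location: Location | None = None
    job_type: list[JobType] | None = None

    def model_dump(self, exclude: set = None):
        data = super().model_dump(exclude=exclude)
        if self.location:
            data['location'] = self.location.display_location()
        if self.job_type:
            data['job_type'] = [jt.value for jt in self.job_type]
        return data

post = JobPost(title="software engineer",
               location=Location(city="Tel Aviv", country="Israel"),
               job_type=[JobType.FULL_TIME])
print(post.model_dump())
# {'title': 'software engineer', 'location': 'Tel Aviv, Israel', 'job_type': ['fulltime']}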

View File

@@ -10,7 +10,7 @@ async def main():
     jobs = scrape_jobs(
         # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
-        site_name=["linkedin"],
+        site_name=["indeed"],
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",

View File

@@ -68,26 +68,29 @@ class IndeedScraper(Scraper):
         job_list = []
         page = 1
-        cursor = None
-        while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            logger.info(
-                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
-            )
-            jobs, cursor = self._scrape_page(cursor)
-            if not jobs:
-                logger.info(f"found no jobs on page: {page}")
-                break
-            job_list += jobs
-            page += 1
+        for location in self.scraper_input.locations:
+            cursor = None
+            logger.info(f"start searching for location: {location}")
+            while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
+                logger.info(
+                    f"search page: {
+                        page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
+                )
+                jobs, cursor = self._scrape_page(cursor, location)
+                if not jobs:
+                    logger.info(f"found no jobs on page: {page}")
+                    break
+                job_list += jobs
+                page += 1
         return JobResponse(
             jobs=job_list[
-                scraper_input.offset : scraper_input.offset
+                scraper_input.offset: scraper_input.offset
                 + scraper_input.results_wanted
             ]
         )

-    def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
+    def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]:
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param cursor:
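
The new loop restarts the Indeed pagination cursor for every location and keeps paging until a page comes back empty or enough results have been seen. The pattern in isolation, as a rough sketch (fetch_page is a hypothetical stand-in for _scrape_page):

def collect(locations: list[str], wanted: int, fetch_page) -> list:
    results = []
    for location in locations:
        cursor = None  # the pagination cursor restarts per location
        while len(results) < wanted:
            jobs, cursor = fetch_page(cursor, location)
            if not jobs:  # this location's pages are exhausted
                break
            results += jobs
    return results

Note that the real loop bounds on len(self.seen_urls) against results_wanted + offset, so results_wanted caps the total across all locations rather than each location separately.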
@@ -104,8 +107,9 @@ class IndeedScraper(Scraper):
         query = job_search_query.format(
             what=(f'what: "{search_term}"' if search_term else ""),
             location=(
-                f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
-                if self.scraper_input.location
+                f'location: {{where: "{location}", radius: {
+                    self.scraper_input.distance}, radiusUnit: MILES}}'
+                if location
                 else ""
             ),
             dateOnIndeed=self.scraper_input.hours_old,
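
The doubled braces in the f-string escape to literal braces, so the formatted fragment carries the braces Indeed's query syntax expects. With illustrative values:

location = "Tel Aviv"
distance = 50
print(f'location: {{where: "{location}", radius: {distance}, radiusUnit: MILES}}')
# location: {where: "Tel Aviv", radius: 50, radiusUnit: MILES}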
@@ -125,7 +129,8 @@ class IndeedScraper(Scraper):
             )
         if not response.ok:
             logger.info(
-                f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
+                f"responded with status code: {
+                    response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor
         data = response.json()
@@ -214,16 +219,20 @@ class IndeedScraper(Scraper):
         job_type = self._get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
-        date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
+        date_posted = datetime.fromtimestamp(
+            timestamp_seconds).strftime("%Y-%m-%d")
         employer = job["employer"].get("dossier") if job["employer"] else None
-        employer_details = employer.get("employerDetails", {}) if employer else {}
+        employer_details = employer.get(
+            "employerDetails", {}) if employer else {}
         rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
         return JobPost(
             id=f'in-{job["key"]}',
             title=job["title"],
             description=description,
-            company_name=job["employer"].get("name") if job.get("employer") else None,
-            company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
+            company_name=job["employer"].get(
+                "name") if job.get("employer") else None,
+            company_url=(f"{self.base_url}{
+                rel_url}" if job["employer"] else None),
             company_url_direct=(
                 employer["links"]["corporateWebsite"] if employer else None
             ),
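
datePublished arrives as epoch milliseconds, hence the division by 1000 before datetime.fromtimestamp. A worked example:

from datetime import datetime

timestamp_ms = 1734000000000  # illustrative epoch-milliseconds value
print(datetime.fromtimestamp(timestamp_ms / 1000).strftime("%Y-%m-%d"))
# 2024-12-12 (fromtimestamp uses the local timezone, so the date can shift)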
@@ -235,11 +244,14 @@ class IndeedScraper(Scraper):
             job_type=job_type,
             compensation=self._get_compensation(job["compensation"]),
             date_posted=date_posted,
+            datetime_posted=date_posted,
             job_url=job_url,
             job_url_direct=(
-                job["recruit"].get("viewJobUrl") if job.get("recruit") else None
+                job["recruit"].get("viewJobUrl") if job.get(
+                    "recruit") else None
             ),
-            emails=extract_emails_from_text(description) if description else None,
+            emails=extract_emails_from_text(
+                description) if description else None,
             is_remote=self._is_job_remote(job, description),
             company_addresses=(
                 employer_details["addresses"][0]
@@ -255,7 +267,8 @@ class IndeedScraper(Scraper):
                 if employer_details.get("industry")
                 else None
             ),
-            company_num_employees=employer_details.get("employeesLocalizedLabel"),
+            company_num_employees=employer_details.get(
+                "employeesLocalizedLabel"),
             company_revenue=employer_details.get("revenueLocalizedLabel"),
             company_description=employer_details.get("briefDescription"),
             company_logo=(
@@ -274,7 +287,8 @@ class IndeedScraper(Scraper):
         """
         job_types: list[JobType] = []
         for attribute in attributes:
-            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
+            job_type_str = attribute["label"].replace(
+                "-", "").replace(" ", "").lower()
             job_type = get_enum_from_job_type(job_type_str)
             if job_type:
                 job_types.append(job_type)
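
The normalization strips hyphens and spaces and lowercases, so Indeed's attribute labels line up with the JobType enum values. For example:

label = "Full-time"  # illustrative Indeed attribute label
print(label.replace("-", "").replace(" ", "").lower())  # fulltime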
@@ -319,7 +333,8 @@ class IndeedScraper(Scraper):
         """
         remote_keywords = ["remote", "work from home", "wfh"]
         is_remote_in_attributes = any(
-            any(keyword in attr["label"].lower() for keyword in remote_keywords)
+            any(keyword in attr["label"].lower()
+                for keyword in remote_keywords)
             for attr in job["attributes"]
         )
         is_remote_in_description = any(
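
The nested any() reads: true if any attribute label contains any of the remote keywords. In isolation:

remote_keywords = ["remote", "work from home", "wfh"]
attributes = [{"label": "Hybrid work"}, {"label": "Remote"}]  # illustrative
print(any(
    any(keyword in attr["label"].lower() for keyword in remote_keywords)
    for attr in attributes
))  # True: "remote" matches the second label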