From f0ea89b357201a2ad26e30ce276af7e8159f7aab Mon Sep 17 00:00:00 2001 From: Yariv Menachem Date: Thu, 12 Dec 2024 13:37:39 +0200 Subject: [PATCH] added support for indeed with multi locations, fixed parsing class to mongo --- src/jobspy/jobs/__init__.py | 8 +++- src/jobspy/main.py | 2 +- src/jobspy/scrapers/indeed/__init__.py | 65 ++++++++++++++++---------- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index b96f2c0..beef998 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -290,10 +290,14 @@ class JobPost(BaseModel): job_function: str | None = None def model_dump(self, exclude: set = None): - # Use `Location`'s custom serialization logic data = super().model_dump(exclude=exclude) + # Use `Location`'s custom serialization logic if self.location: - data['location'] = self.location.model_dump() + data['location'] = self.location.display_location() + + # Serialize `job_type` as a list of strings + if self.job_type: + data['job_type'] = [jt.value for jt in self.job_type] return data @staticmethod diff --git a/src/jobspy/main.py b/src/jobspy/main.py index bc6c0ca..e2ec855 100644 --- a/src/jobspy/main.py +++ b/src/jobspy/main.py @@ -10,7 +10,7 @@ async def main(): jobs = scrape_jobs( # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"], - site_name=["linkedin"], + site_name=["indeed"], search_term="software engineer", google_search_term="software engineer jobs near Tel Aviv Israel since yesterday", location="Central, Israel", diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index bd379ab..05ae16c 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -68,26 +68,29 @@ class IndeedScraper(Scraper): job_list = [] page = 1 - cursor = None + for location in self.scraper_input.locations: + cursor = None + logger.info(f"start searching for location: {location}") + 
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset: + logger.info( + f"search page: { + page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}" + ) + jobs, cursor = self._scrape_page(cursor, location) + if not jobs: + logger.info(f"found no jobs on page: {page}") + break + job_list += jobs + page += 1 - while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset: - logger.info( - f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}" - ) - jobs, cursor = self._scrape_page(cursor) - if not jobs: - logger.info(f"found no jobs on page: {page}") - break - job_list += jobs - page += 1 return JobResponse( jobs=job_list[ - scraper_input.offset : scraper_input.offset + scraper_input.offset: scraper_input.offset + scraper_input.results_wanted ] ) - def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]: + def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]: """ Scrapes a page of Indeed for jobs with scraper_input criteria :param cursor: @@ -104,8 +107,9 @@ class IndeedScraper(Scraper): query = job_search_query.format( what=(f'what: "{search_term}"' if search_term else ""), location=( - f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}' - if self.scraper_input.location + f'location: {{where: "{location}", radius: { + self.scraper_input.distance}, radiusUnit: MILES}}' + if location else "" ), dateOnIndeed=self.scraper_input.hours_old, @@ -125,7 +129,8 @@ class IndeedScraper(Scraper): ) if not response.ok: logger.info( - f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)" + f"responded with status code: { + response.status_code} (submit GitHub issue if this appears to be a bug)" ) return jobs, new_cursor data = response.json() @@ -214,16 +219,20 @@ class IndeedScraper(Scraper): job_type = 
self._get_job_type(job["attributes"]) timestamp_seconds = job["datePublished"] / 1000 - date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d") + date_posted = datetime.fromtimestamp( + timestamp_seconds).strftime("%Y-%m-%d") employer = job["employer"].get("dossier") if job["employer"] else None - employer_details = employer.get("employerDetails", {}) if employer else {} + employer_details = employer.get( + "employerDetails", {}) if employer else {} rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None return JobPost( id=f'in-{job["key"]}', title=job["title"], description=description, - company_name=job["employer"].get("name") if job.get("employer") else None, - company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None), + company_name=job["employer"].get( + "name") if job.get("employer") else None, + company_url=(f"{self.base_url}{ + rel_url}" if job["employer"] else None), company_url_direct=( employer["links"]["corporateWebsite"] if employer else None ), @@ -235,11 +244,14 @@ class IndeedScraper(Scraper): job_type=job_type, compensation=self._get_compensation(job["compensation"]), date_posted=date_posted, + datetime_posted=date_posted, job_url=job_url, job_url_direct=( - job["recruit"].get("viewJobUrl") if job.get("recruit") else None + job["recruit"].get("viewJobUrl") if job.get( + "recruit") else None ), - emails=extract_emails_from_text(description) if description else None, + emails=extract_emails_from_text( + description) if description else None, is_remote=self._is_job_remote(job, description), company_addresses=( employer_details["addresses"][0] @@ -255,7 +267,8 @@ class IndeedScraper(Scraper): if employer_details.get("industry") else None ), - company_num_employees=employer_details.get("employeesLocalizedLabel"), + company_num_employees=employer_details.get( + "employeesLocalizedLabel"), company_revenue=employer_details.get("revenueLocalizedLabel"), 
company_description=employer_details.get("briefDescription"), company_logo=( @@ -274,7 +287,8 @@ class IndeedScraper(Scraper): """ job_types: list[JobType] = [] for attribute in attributes: - job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower() + job_type_str = attribute["label"].replace( + "-", "").replace(" ", "").lower() job_type = get_enum_from_job_type(job_type_str) if job_type: job_types.append(job_type) @@ -319,7 +333,8 @@ class IndeedScraper(Scraper): """ remote_keywords = ["remote", "work from home", "wfh"] is_remote_in_attributes = any( - any(keyword in attr["label"].lower() for keyword in remote_keywords) + any(keyword in attr["label"].lower() + for keyword in remote_keywords) for attr in job["attributes"] ) is_remote_in_description = any(