From 7fd13c888e707b5ca00ac5d4be4a2bae8dfa4f48 Mon Sep 17 00:00:00 2001
From: Yariv Menachem
Date: Tue, 24 Dec 2024 12:29:44 +0200
Subject: [PATCH] added some logs and updated the insert function

---
 src/jobspy/db/job_repository.py           | 60 ++++++++++++++---------
 src/jobspy/scrapers/glassdoor/__init__.py |  5 +-
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/src/jobspy/db/job_repository.py b/src/jobspy/db/job_repository.py
index 45bfd3e..69f1aed 100644
--- a/src/jobspy/db/job_repository.py
+++ b/src/jobspy/db/job_repository.py
@@ -2,6 +2,7 @@ import os
 from typing import List
 from dotenv import load_dotenv
 from pymongo import MongoClient, UpdateOne
+import pymongo
 
 from jobspy.jobs import JobPost
 
@@ -28,35 +29,48 @@ class JobRepository:
             self.collection.insert_one(job_dict)
             print(f"Inserted new job with title {job.title}.")
 
     def insertManyIfNotFound(self, jobs: List[JobPost]) -> List[JobPost]:
         """
         Perform bulk upserts for a list of JobPost objects into a MongoDB collection.
         Only insert new jobs and return the list of newly inserted jobs.
+
+        Args:
+            jobs (List[JobPost]): List of JobPost objects to insert.
+
+        Returns:
+            List[JobPost]: Jobs that did not exist yet and were inserted by
+                this call, in their original order.
+
+        Note:
+            pymongo.errors.BulkWriteError is caught and printed, not raised;
+            jobs upserted before the failure are still returned.
         """
-        operations = []
-        new_jobs = []  # List to store the new jobs inserted into MongoDB
-        for job in jobs:
-            job_dict = job.model_dump(exclude={"date_posted"})
-            operations.append(
-                UpdateOne(
-                    {"id": job.id},  # Match by `id`
-                    # Only set fields if the job is being inserted (not updated)
-                    {"$setOnInsert": job_dict},
-                    upsert=True  # Insert if not found, but do not update if already exists
-                )
-            )
+        if not jobs:
+            # bulk_write() raises InvalidOperation on an empty request list.
+            return []
 
-        if operations:
-            # Execute all operations in bulk
+        # Match by `id`; $setOnInsert only writes fields when the document is
+        # created, so jobs that already exist are left untouched.
+        operations = [
+            UpdateOne(
+                {"id": job.id},
+                {"$setOnInsert": job.model_dump(exclude={"date_posted"})},
+                upsert=True,
+            )
+            for job in jobs
+        ]
+
+        try:
             result = self.collection.bulk_write(operations)
-            print(f"Matched: {result.matched_count}, Upserts: {
-                result.upserted_count}, Modified: {result.modified_count}")
+            # upserted_ids maps operation index -> _id for each upsert that
+            # created a document; inserted_count only counts InsertOne ops.
+            new_indexes = list(result.upserted_ids or {})
+        except pymongo.errors.BulkWriteError as e:
+            # Handle bulk write errors gracefully, e.g., log details
+            print(f"Bulk Insert Error: {e}")
+            # The error details still list the upserts that succeeded.
+            new_indexes = [u["index"] for u in e.details.get("upserted", [])]
 
-            # Get the newly inserted jobs (those that were upserted)
-            # The `upserted_count` corresponds to how many new documents were inserted
-            for i, job in enumerate(jobs):
-                if result.upserted_count > 0 and i < result.upserted_count:
-                    new_jobs.append(job)
-                    print(f"New Job ID: {job.id}, Label: {job.title}")
-
+        new_jobs = [jobs[i] for i in sorted(new_indexes)]
+        print(f"Inserted Jobs: {len(new_jobs)}")
         return new_jobs
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index a5686a0..3f24451 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -111,9 +111,11 @@ class GlassdoorScraper(Scraper):
             )
             if response.status_code != 200:
                 exc_msg = f"bad response status code: {response.status_code}"
+                logger.error(f"GlassdoorException : {exc_msg}")
                 raise GlassdoorException(exc_msg)
             res_json = response.json()[0]
             if "errors" in res_json:
+                logger.error("Error encountered in API response")
                 raise ValueError("Error encountered in API response")
         except (
             requests.exceptions.ReadTimeout,
@@ -136,6 +138,7 @@ class GlassdoorScraper(Scraper):
                     if job_post:
                         jobs.append(job_post)
                 except Exception as exc:
+                    logger.error(f"Glassdoor generated an exception: {exc}")
                     raise GlassdoorException(
                         f"Glassdoor generated an exception: {exc}")
 
@@ -347,7 +350,7 @@ class GlassdoorScraper(Scraper):
             item for item in items if item.label is not None and formatted_city in item.label
         ]
         if not items:
-            logger.error(f"location not found in Glassdoor: {location}")
+            logger.error(f"ValueError: Location '{location}' not found on Glassdoor")
             # raise ValueError(f"Location '{location}' not found on Glassdoor")
 
         return items