mirror of https://github.com/Bunsly/JobSpy
added some logs and updated the insert function
parent
54022f2b57
commit
7fd13c888e
|
@ -2,6 +2,7 @@ import os
|
||||||
from typing import List
|
from typing import List
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from pymongo import MongoClient, UpdateOne
|
from pymongo import MongoClient, UpdateOne
|
||||||
|
import pymongo
|
||||||
|
|
||||||
from jobspy.jobs import JobPost
|
from jobspy.jobs import JobPost
|
||||||
|
|
||||||
|
@ -28,35 +29,66 @@ class JobRepository:
|
||||||
self.collection.insert_one(job_dict)
|
self.collection.insert_one(job_dict)
|
||||||
print(f"Inserted new job with title {job.title}.")
|
print(f"Inserted new job with title {job.title}.")
|
||||||
|
|
||||||
|
# def insertManyIfNotFound(self, jobs: List[JobPost]) -> List[JobPost]:
|
||||||
|
# """
|
||||||
|
# Perform bulk upserts for a list of JobPost objects into a MongoDB collection.
|
||||||
|
# Only insert new jobs and return the list of newly inserted jobs.
|
||||||
|
# """
|
||||||
|
# operations = []
|
||||||
|
# new_jobs = [] # List to store the new jobs inserted into MongoDB
|
||||||
|
# for job in jobs:
|
||||||
|
# job_dict = job.model_dump(exclude={"date_posted"})
|
||||||
|
# operations.append(
|
||||||
|
# UpdateOne(
|
||||||
|
# {"id": job.id}, # Match by `id`
|
||||||
|
# # Only set fields if the job is being inserted (not updated)
|
||||||
|
# {"$setOnInsert": job_dict},
|
||||||
|
# upsert=True # Insert if not found, but do not update if already exists
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
|
||||||
|
# if operations:
|
||||||
|
# # Execute all operations in bulk
|
||||||
|
# result = self.collection.bulk_write(operations)
|
||||||
|
# print(f"Matched: {result.matched_count}, Upserts: {
|
||||||
|
# result.upserted_count}, Modified: {result.modified_count}")
|
||||||
|
|
||||||
|
# # Get the newly inserted jobs (those that were upserted)
|
||||||
|
# # The `upserted_count` corresponds to how many new documents were inserted
|
||||||
|
# for i, job in enumerate(jobs):
|
||||||
|
# if result.upserted_count > 0 and i < result.upserted_count:
|
||||||
|
# new_jobs.append(job)
|
||||||
|
# print(f"New Job ID: {job.id}, Label: {job.title}")
|
||||||
|
|
||||||
|
# return new_jobs
|
||||||
|
|
||||||
def insertManyIfNotFound(self, jobs: List[JobPost]) -> List[JobPost]:
    """
    Bulk-upsert JobPost objects and return only the ones newly inserted.

    Each job is matched on its `id`. `$setOnInsert` writes the document
    only when no match exists, so documents already in the collection are
    never modified.

    Args:
        jobs (List[JobPost]): List of JobPost objects to insert.

    Returns:
        List[JobPost]: The jobs that were not present before the call and
        were inserted by it, in input order. Empty when `jobs` is empty
        or every job already existed.

    Note:
        pymongo.errors.BulkWriteError is reported and not re-raised; jobs
        that were upserted before the failure are still returned.
    """
    if not jobs:
        # bulk_write() rejects an empty request list, so bail out early.
        return []

    operations = [
        UpdateOne(
            {"id": job.id},
            {"$setOnInsert": job.model_dump(exclude={"date_posted"})},
            upsert=True,
        )
        for job in jobs
    ]

    try:
        result = self.collection.bulk_write(operations)
    except pymongo.errors.BulkWriteError as e:
        print(f"Bulk Insert Error: {e}")
        # The error details still list the upserts that succeeded before
        # the failing operation, keyed by their position in `operations`.
        upserted = (e.details or {}).get("upserted", [])
        upserted_indexes = sorted(entry["index"] for entry in upserted)
    else:
        # Upserts are reported through `upserted_ids` (operation index ->
        # new _id). `inserted_count` only counts InsertOne operations and
        # is always 0 for this request.
        upserted_indexes = sorted(result.upserted_ids or {})

    # Operation i was built from jobs[i], so the indexes map straight back.
    new_jobs = [jobs[i] for i in upserted_indexes]
    print(f"Inserted Jobs: {len(new_jobs)}")
    return new_jobs
|
||||||
|
|
|
@ -111,9 +111,11 @@ class GlassdoorScraper(Scraper):
|
||||||
)
|
)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
exc_msg = f"bad response status code: {response.status_code}"
|
exc_msg = f"bad response status code: {response.status_code}"
|
||||||
|
logger.error(f"GlassdoorException : {exc_msg}")
|
||||||
raise GlassdoorException(exc_msg)
|
raise GlassdoorException(exc_msg)
|
||||||
res_json = response.json()[0]
|
res_json = response.json()[0]
|
||||||
if "errors" in res_json:
|
if "errors" in res_json:
|
||||||
|
logger.error("Error encountered in API response")
|
||||||
raise ValueError("Error encountered in API response")
|
raise ValueError("Error encountered in API response")
|
||||||
except (
|
except (
|
||||||
requests.exceptions.ReadTimeout,
|
requests.exceptions.ReadTimeout,
|
||||||
|
@ -136,6 +138,7 @@ class GlassdoorScraper(Scraper):
|
||||||
if job_post:
|
if job_post:
|
||||||
jobs.append(job_post)
|
jobs.append(job_post)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
logger.error(f"Glassdoor generated an exception: {exc}")
|
||||||
raise GlassdoorException(
|
raise GlassdoorException(
|
||||||
f"Glassdoor generated an exception: {exc}")
|
f"Glassdoor generated an exception: {exc}")
|
||||||
|
|
||||||
|
@ -347,7 +350,8 @@ class GlassdoorScraper(Scraper):
|
||||||
item for item in items if item.label is not None and formatted_city in item.label
|
item for item in items if item.label is not None and formatted_city in item.label
|
||||||
]
|
]
|
||||||
if not items:
|
if not items:
|
||||||
logger.error(f"location not found in Glassdoor: {location}")
|
logger.error(f"ValueError: Location '{
|
||||||
|
location}' not found on Glassdoor")
|
||||||
# raise ValueError(f"Location '{location}' not found on Glassdoor")
|
# raise ValueError(f"Location '{location}' not found on Glassdoor")
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
Loading…
Reference in New Issue