remove duplicates - gsheets (#29)

This commit is contained in:
Cullen Watson
2023-08-31 10:29:43 -05:00
committed by GitHub
parent 9550886091
commit c4baa79181
6 changed files with 86 additions and 35 deletions

View File

@@ -11,8 +11,33 @@ from settings import *
class CSVFormatter:
@staticmethod
def fetch_job_urls(credentials: Any) -> set:
    """
    Collects the job urls already stored in the google sheet so duplicates can be skipped
    :param credentials: credentials object handed to gspread.authorize
    :return: set holding the cell at index 3 of every row after the first
    """
    client = gspread.authorize(credentials)
    first_sheet = client.open(GSHEET_NAME).get_worksheet(0)
    # The first row is skipped (treated as the header row); any error raised
    # while talking to the sheet propagates to the caller unchanged.
    return {cells[3] for cells in first_sheet.get_all_values()[1:]}
@staticmethod
def upload_to_google_sheet(csv_data: str):
"""
Appends rows to google sheet
:param csv_data:
:return:
"""
try:
scope = [
"https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
data_string = csv_data.getvalue()
reader = csv.reader(StringIO(data_string))
job_urls = CSVFormatter.fetch_job_urls(credentials)
rows = list(reader)
for i, row in enumerate(rows):
if i == 0:
continue
if row[4] in job_urls:
continue
row[6] = format(int(row[6]), ",d") if row[6] else ""
row[7] = format(int(row[7]), ",d") if row[7] else ""
worksheet.append_row(row)
except Exception as e:
raise e
@staticmethod
def generate_filename() -> str:
    """
    Builds the results filename, stamped with the current local date and time
    :return: filename shaped like JobSpy_results_YYYYMMDD_HHMMSS.csv
    """
    current_time = datetime.now()
    timestamp = current_time.strftime("%Y%m%d_%H%M%S")
    return f"JobSpy_results_{timestamp}.csv"
@staticmethod
def generate_filename() -> str:
    """
    Produces the CSV filename with a timestamp suffix taken from the current local time
    :return: filename
    """
    # format() on a datetime delegates to strftime with the given pattern.
    timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    return f"JobSpy_results_{timestamp}.csv"
@@ -59,20 +90,17 @@ class CSVFormatter:
writer = csv.writer(output)
headers = [
"Site",
"Title",
"Company Name",
"Job URL",
"Country",
"City",
"State",
"Job Type",
"Compensation Interval",
"Pay Cycle",
"Min Amount",
"Max Amount",
"Currency",
"Date Posted",
"Description",
"Job URL",
]
writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
for job in job_response["jobs"]:
writer.writerow(
[
site,
job["title"],
job["company_name"],
job["job_url"],
job["location"]["country"],
job["location"]["city"],
job["location"]["state"],
job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
job["compensation"]["max_amount"]
if job["compensation"]
else "",
job["compensation"]["currency"]
if job["compensation"]
else "",
job.get("date_posted", ""),
job["description"],
job["job_url"],
]
)