remove duplicates - gsheets (#29)

This commit is contained in:
Cullen Watson
2023-08-31 10:29:43 -05:00
committed by GitHub
parent 9550886091
commit c4baa79181
6 changed files with 86 additions and 35 deletions

View File

@@ -11,8 +11,33 @@ from settings import *
class CSVFormatter:
@staticmethod
def fetch_job_urls(credentials: Any, url_column: int = 3) -> set:
    """
    Fetch all job URLs already present in the Google Sheet so new rows
    can be de-duplicated before appending.

    :param credentials: credentials object accepted by ``gspread.authorize``
    :param url_column: zero-based column index that holds the job URL
        (defaults to 3, matching the current sheet layout)
    :return: set of job URL strings, header row excluded
    """
    # The original wrapped this in `try: ... except Exception as e: raise e`,
    # which is a no-op that only mangles the traceback — removed.
    gc = gspread.authorize(credentials)
    worksheet = gc.open(GSHEET_NAME).get_worksheet(0)
    rows = worksheet.get_all_values()
    # Skip the header row; a set gives O(1) membership tests for dedup.
    return {row[url_column] for row in rows[1:]}
@staticmethod
def upload_to_google_sheet(csv_data: str):
"""
Appends rows to google sheet
:param csv_data:
:return:
"""
try:
scope = [
"https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
data_string = csv_data.getvalue()
reader = csv.reader(StringIO(data_string))
job_urls = CSVFormatter.fetch_job_urls(credentials)
rows = list(reader)
for i, row in enumerate(rows):
if i == 0:
continue
if row[4] in job_urls:
continue
row[6] = format(int(row[6]), ",d") if row[6] else ""
row[7] = format(int(row[7]), ",d") if row[7] else ""
worksheet.append_row(row)
except Exception as e:
raise e
@staticmethod
def generate_filename() -> str:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
@staticmethod
def generate_filename() -> str:
"""
Adds a timestamp to the filename header
:return: filename
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
@@ -59,20 +90,17 @@ class CSVFormatter:
writer = csv.writer(output)
headers = [
"Site",
"Title",
"Company Name",
"Job URL",
"Country",
"City",
"State",
"Job Type",
"Compensation Interval",
"Pay Cycle",
"Min Amount",
"Max Amount",
"Currency",
"Date Posted",
"Description",
"Job URL",
]
writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
for job in job_response["jobs"]:
writer.writerow(
[
site,
job["title"],
job["company_name"],
job["job_url"],
job["location"]["country"],
job["location"]["city"],
job["location"]["state"],
job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
job["compensation"]["max_amount"]
if job["compensation"]
else "",
job["compensation"]["currency"]
if job["compensation"]
else "",
job.get("date_posted", ""),
job["description"],
job["job_url"],
]
)

View File

@@ -1,5 +1,5 @@
from typing import Union
from datetime import datetime
from datetime import date
from enum import Enum
from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):
class Compensation(BaseModel):
interval: CompensationInterval
min_amount: float
max_amount: float
currency: str = "USA"
min_amount: int
max_amount: int
currency: str = "USD"
class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
description: str = None
job_type: JobType = None
compensation: Compensation = None
date_posted: datetime = None
# why is 08-28-2023 a validation error for type date? how do I fix this?
date_posted: date = None
class JobResponse(BaseModel):

View File

@@ -1,6 +1,7 @@
import re
import json
from typing import Optional, Tuple, List
from datetime import datetime
import tls_client
import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
import math
import traceback
import sys
class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
raise StatusException(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in str(soup):
raise ParsingException("Search did not match any jobs")
jobs = IndeedScraper.parse_jobs(
soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
def process_job(job) -> Optional[JobPost]:
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
if interval in CompensationInterval.__members__:
compensation = Compensation(
interval=CompensationInterval[interval],
min_amount=extracted_salary.get("max"),
max_amount=extracted_salary.get("min"),
min_amount=int(extracted_salary.get("max")),
max_amount=int(extracted_salary.get("min")),
currency=currency,
)
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session)
li_elements = snippet_html.find_all("li")
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
job_type=job_type,
compensation=compensation,
date_posted=date_posted,
job_url=job_url,
job_url=job_url_client,
)
return job_post
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
jobs, _ = future.result()
job_list += jobs
except StatusException as e:
return JobResponse(
success=False,
error=f"Indeed returned status code {e.status_code}",
)
except ParsingException as e:
return JobResponse(
success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
:return: script_tag
"""
script_tags = soup.find_all("script")
for tag in script_tags:
if (
tag.string

View File

@@ -1,4 +1,5 @@
from typing import Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup

View File

@@ -1,4 +1,6 @@
import math
import json
from datetime import datetime
from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs
@@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
success=False,
error=f"ZipRecruiter returned status code {e.status_code}",
)
except Exception as e:
return JobResponse(
success=False,
error=f"ZipRecruiter failed to scrape: {e}",
)
#: note: this does not handle if the results are more or less than the results_wanted
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: BeautifulSoup) -> Optional[str]:
def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
url_time = button["data-href"]
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
return params.get("posted_time", [None])[0]
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
@staticmethod
def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = float(amount) * 1000
amount = int(float(amount)) * 1000
else:
amount = float(amount)
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(