remove duplicates - gsheets (#29)

pull/30/head^2
Cullen Watson 2023-08-31 10:29:43 -05:00 committed by GitHub
parent 9550886091
commit c4baa79181
6 changed files with 86 additions and 35 deletions

View File

@@ -11,8 +11,33 @@ from settings import *
 class CSVFormatter:
+    @staticmethod
+    def fetch_job_urls(credentials: Any) -> set:
+        """
+        Fetches all the job urls from the google sheet to prevent duplicates
+        :param credentials:
+        :return: urls
+        """
+        try:
+            gc = gspread.authorize(credentials)
+            sh = gc.open(GSHEET_NAME)
+            worksheet = sh.get_worksheet(0)
+            data = worksheet.get_all_values()
+            job_urls = set()
+            for row in data[1:]:
+                job_urls.add(row[3])
+            return job_urls
+        except Exception as e:
+            raise e
+
     @staticmethod
     def upload_to_google_sheet(csv_data: str):
+        """
+        Appends rows to google sheet
+        :param csv_data:
+        :return:
+        """
         try:
             scope = [
                 "https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
             data_string = csv_data.getvalue()
             reader = csv.reader(StringIO(data_string))
+            job_urls = CSVFormatter.fetch_job_urls(credentials)
             rows = list(reader)
             for i, row in enumerate(rows):
                 if i == 0:
                     continue
+                if row[4] in job_urls:
+                    continue
+                row[6] = format(int(row[6]), ",d") if row[6] else ""
+                row[7] = format(int(row[7]), ",d") if row[7] else ""
                 worksheet.append_row(row)
         except Exception as e:
             raise e
 
     @staticmethod
     def generate_filename() -> str:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        return f"JobSpy_results_{timestamp}.csv"
-
-    @staticmethod
-    def generate_filename() -> str:
+        """
+        Adds a timestamp to the filename header
+        :return: filename
+        """
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         return f"JobSpy_results_{timestamp}.csv"
@@ -59,20 +90,17 @@ class CSVFormatter:
         writer = csv.writer(output)
 
         headers = [
-            "Site",
             "Title",
             "Company Name",
-            "Job URL",
-            "Country",
             "City",
             "State",
             "Job Type",
-            "Compensation Interval",
+            "Pay Cycle",
             "Min Amount",
             "Max Amount",
-            "Currency",
             "Date Posted",
             "Description",
+            "Job URL",
         ]
         writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
         for job in job_response["jobs"]:
             writer.writerow(
                 [
-                    site,
                     job["title"],
                     job["company_name"],
-                    job["job_url"],
-                    job["location"]["country"],
                     job["location"]["city"],
                     job["location"]["state"],
                     job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
                     job["compensation"]["max_amount"]
                     if job["compensation"]
                     else "",
-                    job["compensation"]["currency"]
-                    if job["compensation"]
-                    else "",
                     job.get("date_posted", ""),
                     job["description"],
+                    job["job_url"],
                 ]
             )

View File

@@ -1,5 +1,5 @@
 from typing import Union
-from datetime import datetime
+from datetime import date
 from enum import Enum
 from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):
 class Compensation(BaseModel):
     interval: CompensationInterval
-    min_amount: float
-    max_amount: float
-    currency: str = "USA"
+    min_amount: int
+    max_amount: int
+    currency: str = "USD"
 
 
 class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
-    date_posted: datetime = None
+    # why is 08-28-2023 a validation error for type date? how do I fix this?
+    date_posted: date = None
 
 
 class JobResponse(BaseModel):
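
On the in-code question above (why "08-28-2023" fails validation for type date): pydantic v1 parses a date field from an ISO-8601 string (YYYY-MM-DD), a date object, or a numeric timestamp; MM-DD-YYYY is none of those. A hedged sketch, with Post as an illustrative stand-in for JobPost:

from datetime import date, datetime
from pydantic import BaseModel

class Post(BaseModel):  # illustrative stand-in for the real JobPost
    date_posted: date = None

Post(date_posted="2023-08-28")       # OK: ISO-8601 string
Post(date_posted=date(2023, 8, 28))  # OK: already a date object
# Post(date_posted="08-28-2023")     # ValidationError: not ISO-8601
# Fix: pre-parse before constructing the model:
Post(date_posted=datetime.strptime("08-28-2023", "%m-%d-%Y").date())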

View File

@@ -1,6 +1,7 @@
 import re
 import json
 from typing import Optional, Tuple, List
+from datetime import datetime
 
 import tls_client
 import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from concurrent.futures import ThreadPoolExecutor, Future
 import math
+import traceback
+import sys
 
 
 class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
             raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
+        if "did not match any jobs" in str(soup):
+            raise ParsingException("Search did not match any jobs")
 
         jobs = IndeedScraper.parse_jobs(
             soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
         def process_job(job) -> Optional[JobPost]:
             job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
+            job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
             if job_url in self.seen_urls:
                 return None
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
             if interval in CompensationInterval.__members__:
                 compensation = Compensation(
                     interval=CompensationInterval[interval],
-                    min_amount=extracted_salary.get("max"),
-                    max_amount=extracted_salary.get("min"),
+                    min_amount=int(extracted_salary.get("max")),
+                    max_amount=int(extracted_salary.get("min")),
                     currency=currency,
                 )
 
             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
+            date_posted = date_posted.strftime("%Y-%m-%d")
 
             description = self.get_description(job_url, session)
             li_elements = snippet_html.find_all("li")
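
Indeed's pubDate is epoch milliseconds; the two lines above convert it to a datetime and then to a YYYY-MM-DD string, which the now date-typed JobPost.date_posted field re-parses. A small sketch with an illustrative value:

from datetime import datetime

pub_date_ms = 1693238400000  # illustrative epoch-milliseconds value
date_posted = datetime.fromtimestamp(pub_date_ms / 1000).strftime("%Y-%m-%d")
# '2023-08-28' in most zones; note fromtimestamp() uses local time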
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
                 job_type=job_type,
                 compensation=compensation,
                 date_posted=date_posted,
-                job_url=job_url,
+                job_url=job_url_client,
             )
             return job_post
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
                 jobs, _ = future.result()
                 job_list += jobs
             except StatusException as e:
                 return JobResponse(
                     success=False,
                     error=f"Indeed returned status code {e.status_code}",
                 )
             except ParsingException as e:
                 return JobResponse(
                     success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
         :return: script_tag
         """
         script_tags = soup.find_all("script")
+
         for tag in script_tags:
             if (
                 tag.string

View File

@@ -1,4 +1,5 @@
 from typing import Optional
+from datetime import datetime
 import requests
 from bs4 import BeautifulSoup

View File

@ -1,4 +1,6 @@
import math
import json import json
from datetime import datetime
from typing import Optional, Tuple, List from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import * from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
                     success=False,
                     error=f"ZipRecruiter returned status code {e.status_code}",
                 )
+            except Exception as e:
+                return JobResponse(
+                    success=False,
+                    error=f"ZipRecruiter failed to scrape: {e}",
+                )
 
         #: note: this does not handle if the results are more or less than the results_wanted
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
         return CompensationInterval(interval_str)
 
     @staticmethod
-    def get_date_posted(job: BeautifulSoup) -> Optional[str]:
+    def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
         """
         Extracts the date a job was posted
         :param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
         button = job.find(
             "button", {"class": "action_input save_job zrs_btn_secondary_200"}
         )
-        url_time = button["data-href"]
+        if not button:
+            return None
+
+        url_time = button.get("data-href", "")
         url_components = urlparse(url_time)
         params = parse_qs(url_components.query)
-        return params.get("posted_time", [None])[0]
+        posted_time_str = params.get("posted_time", [None])[0]
+
+        if posted_time_str:
+            posted_date = datetime.strptime(
+                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
+            ).date()
+            return posted_date
+
+        return None
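
The rewritten get_date_posted guards against a missing save button and parses the posted_time query parameter into a real date. A condensed sketch of the extraction, assuming the same data-href shape (the href value here is illustrative):

from datetime import datetime
from urllib.parse import urlparse, parse_qs

href = "/jobs/save?posted_time=2023-08-28T14:05:00Z"  # illustrative data-href
posted = parse_qs(urlparse(href).query).get("posted_time", [None])[0]
if posted:
    print(datetime.strptime(posted, "%Y-%m-%dT%H:%M:%SZ").date())  # 2023-08-28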
     @staticmethod
     def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
             amount = amount.replace(",", "").strip("$ ").split(" ")[0]
             if "K" in amount:
                 amount = amount.replace("K", "")
-                amount = float(amount) * 1000
+                amount = int(float(amount)) * 1000
             else:
-                amount = float(amount)
+                amount = int(float(amount))
             amounts.append(amount)
 
         compensation = Compensation(
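
Note the operator order in the new int() coercion: int(float(amount)) * 1000 truncates before multiplying, so a fractional figure like "65.5K" becomes 65000 rather than 65500 (int(float(amount) * 1000) would keep it). A worked sketch of the parsing path:

def parse_amount(raw: str) -> int:
    # mirrors the diff: drop commas, strip "$", keep the first token
    amount = raw.replace(",", "").strip("$ ").split(" ")[0]
    if "K" in amount:
        return int(float(amount.replace("K", ""))) * 1000
    return int(float(amount))

assert parse_amount("$65K") == 65000
assert parse_amount("$62,400 a year") == 62400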

View File

@@ -39,9 +39,8 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)
 
-    with ThreadPoolExecutor() as executor:
+    with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
 
     scraper_response = CommonResponse(status="JSON response success", **results)
 
     if scraper_input.output_format == OutputFormat.CSV:
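
Capping the pool at max_workers=3 presumably matches one thread per supported site; executor.map preserves input order and dict() pairs each site value with its JobResponse. A hedged sketch of the fan-out pattern (site names and return values illustrative):

from concurrent.futures import ThreadPoolExecutor

def scrape_site(site):  # stand-in for the closure above
    return site, f"scraped {site}"  # real code returns (site.value, JobResponse)

sites = ["indeed", "linkedin", "zip_recruiter"]  # illustrative
with ThreadPoolExecutor(max_workers=3) as executor:
    results = dict(executor.map(scrape_site, sites))
# {'indeed': 'scraped indeed', 'linkedin': 'scraped linkedin', 'zip_recruiter': 'scraped zip_recruiter'}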
@@ -56,11 +55,13 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         csv_output = CSVFormatter.format(scraper_response)
         try:
             CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(status="Successfully uploaded to Google Sheets")
+            return CommonResponse(
+                status="Successfully uploaded to Google Sheets", **results
+            )
         except Exception as e:
             return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e)
+                status="Failed to upload to Google Sheet", error=repr(e), **results
             )
     else:
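
Spreading **results into each CommonResponse keeps the per-site JobResponse payloads in the reply even on the Google-Sheets code path, instead of returning a bare status line. A sketch assuming CommonResponse declares the site names as optional fields (the field names and shapes here are assumptions, not the real model):

from typing import Any, Optional
from pydantic import BaseModel

class CommonResponse(BaseModel):  # illustrative shape only
    status: str
    error: Optional[str] = None
    indeed: Optional[Any] = None
    linkedin: Optional[Any] = None
    zip_recruiter: Optional[Any] = None

results = {"indeed": {"success": True, "jobs": []}}  # illustrative
CommonResponse(status="Successfully uploaded to Google Sheets", **results)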