diff --git a/api/core/formatters/csv/__init__.py b/api/core/formatters/csv/__init__.py
index 497efe1..ab89248 100644
--- a/api/core/formatters/csv/__init__.py
+++ b/api/core/formatters/csv/__init__.py
@@ -11,8 +11,33 @@ from settings import *
 
 
 class CSVFormatter:
+    @staticmethod
+    def fetch_job_urls(credentials: Any) -> set:
+        """
+        Fetch all job URLs already in the Google Sheet so duplicates can be skipped
+        :param credentials: Google service account credentials
+        :return: set of job URLs present in the sheet
+        """
+        try:
+            gc = gspread.authorize(credentials)
+            sh = gc.open(GSHEET_NAME)
+
+            worksheet = sh.get_worksheet(0)
+            data = worksheet.get_all_values()
+            job_urls = set()
+            for row in data[1:]:
+                job_urls.add(row[10])  # Job URL column, assuming the sheet mirrors the CSV layout
+            return job_urls
+        except Exception as e:
+            raise e
+
     @staticmethod
     def upload_to_google_sheet(csv_data: str):
+        """
+        Append CSV rows to the Google Sheet, skipping URLs that are already present
+        :param csv_data: CSV buffer produced by CSVFormatter.format
+        :return: None
+        """
         try:
             scope = [
                 "https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
 
             data_string = csv_data.getvalue()
             reader = csv.reader(StringIO(data_string))
+            job_urls = CSVFormatter.fetch_job_urls(credentials)
+
             rows = list(reader)
 
             for i, row in enumerate(rows):
                 if i == 0:
                     continue
+                if row[10] in job_urls:  # Job URL is the last CSV column
+                    continue
+
+                row[6] = format(int(row[6]), ",d") if row[6] else ""
+                row[7] = format(int(row[7]), ",d") if row[7] else ""
                 worksheet.append_row(row)
         except Exception as e:
             raise e
 
     @staticmethod
     def generate_filename() -> str:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        return f"JobSpy_results_{timestamp}.csv"
-
-    @staticmethod
-    def generate_filename() -> str:
+        """
+        Generate a timestamped CSV filename
+        :return: filename
+        """
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         return f"JobSpy_results_{timestamp}.csv"
 
@@ -59,20 +90,17 @@ class CSVFormatter:
         writer = csv.writer(output)
 
         headers = [
-            "Site",
             "Title",
             "Company Name",
-            "Job URL",
-            "Country",
             "City",
             "State",
             "Job Type",
-            "Compensation Interval",
+            "Pay Cycle",
             "Min Amount",
             "Max Amount",
-            "Currency",
             "Date Posted",
             "Description",
+            "Job URL",
         ]
 
         writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
             for job in job_response["jobs"]:
                 writer.writerow(
                     [
-                        site,
                         job["title"],
                         job["company_name"],
-                        job["job_url"],
-                        job["location"]["country"],
                         job["location"]["city"],
                         job["location"]["state"],
                         job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
                         job["compensation"]["max_amount"]
                         if job["compensation"]
                         else "",
-                        job["compensation"]["currency"]
-                        if job["compensation"]
-                        else "",
                         job.get("date_posted", ""),
                         job["description"],
+                        job["job_url"],
                     ]
                 )
 
diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py
index 63f873c..88c052b 100644
--- a/api/core/jobs/__init__.py
+++ b/api/core/jobs/__init__.py
@@ -1,5 +1,5 @@
 from typing import Union
-from datetime import datetime
+from datetime import date
 from enum import Enum
 
 from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):
 
 class Compensation(BaseModel):
     interval: CompensationInterval
-    min_amount: float
-    max_amount: float
-    currency: str = "USA"
+    min_amount: int
+    max_amount: int
+    currency: str = "USD"
 
 
 class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
-    date_posted: datetime = None
+    # pydantic parses `date` from ISO 8601 (YYYY-MM-DD), so "08-28-2023" fails validation; scrapers normalize first
+    date_posted: date = None
 
 
 class JobResponse(BaseModel):
diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 9c4e24f..60778f5 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -1,6 +1,7 @@
 import re
 import json
 from typing import Optional, Tuple, List
+from datetime import datetime
 
 import tls_client
 import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from concurrent.futures import ThreadPoolExecutor, Future
 import math
+import traceback
+import sys
 
 
 class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
             raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
+        if "did not match any jobs" in str(soup):
+            raise ParsingException("Search did not match any jobs")
 
         jobs = IndeedScraper.parse_jobs(
             soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
 
         def process_job(job) -> Optional[JobPost]:
             job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
+            job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
             if job_url in self.seen_urls:
                 return None
 
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
                 if interval in CompensationInterval.__members__:
                     compensation = Compensation(
                         interval=CompensationInterval[interval],
-                        min_amount=extracted_salary.get("max"),
-                        max_amount=extracted_salary.get("min"),
+                        min_amount=int(extracted_salary.get("min")),
+                        max_amount=int(extracted_salary.get("max")),
                         currency=currency,
                     )
 
             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
+            date_posted = date_posted.strftime("%Y-%m-%d")
 
             description = self.get_description(job_url, session)
             li_elements = snippet_html.find_all("li")
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
                 job_type=job_type,
                 compensation=compensation,
                 date_posted=date_posted,
-                job_url=job_url,
+                job_url=job_url_client,
             )
 
             return job_post
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
                 jobs, _ = future.result()
 
                 job_list += jobs
-
         except StatusException as e:
             return JobResponse(
                 success=False,
                 error=f"Indeed returned status code {e.status_code}",
             )
+
         except ParsingException as e:
             return JobResponse(
                 success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
         :return: script_tag
         """
         script_tags = soup.find_all("script")
+
         for tag in script_tags:
             if (
                 tag.string
diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py
index bdaa58a..cbbafa7 100644
--- a/api/core/scrapers/linkedin/__init__.py
+++ b/api/core/scrapers/linkedin/__init__.py
@@ -1,4 +1,5 @@
 from typing import Optional
+from datetime import datetime
 
 import requests
 from bs4 import BeautifulSoup
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 905fcc8..15962af 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -1,4 +1,6 @@
+import math
 import json
+from datetime import datetime, date
 from typing import Optional, Tuple, List
 from urllib.parse import urlparse, parse_qs
 
@@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
 from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
-import math
 
 
 class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
                 success=False,
                 error=f"ZipRecruiter returned status code {e.status_code}",
             )
+        except Exception as e:
+            return JobResponse(
+                success=False,
+                error=f"ZipRecruiter failed to scrape: {e}",
+            )
 
     #: note: this does not handle if the results are more or less than the results_wanted
 
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
         return CompensationInterval(interval_str)
 
     @staticmethod
-    def get_date_posted(job: BeautifulSoup) -> Optional[str]:
+    def get_date_posted(job: BeautifulSoup) -> Optional[date]:
         """
         Extracts the date a job was posted
         :param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
         button = job.find(
             "button", {"class": "action_input save_job zrs_btn_secondary_200"}
         )
-        url_time = button["data-href"]
+        if not button:
+            return None
+
+        url_time = button.get("data-href", "")
         url_components = urlparse(url_time)
         params = parse_qs(url_components.query)
-        return params.get("posted_time", [None])[0]
+        posted_time_str = params.get("posted_time", [None])[0]
+
+        if posted_time_str:
+            posted_date = datetime.strptime(
+                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
+            ).date()
+            return posted_date
+
+        return None
 
     @staticmethod
     def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
             amount = amount.replace(",", "").strip("$ ").split(" ")[0]
             if "K" in amount:
                 amount = amount.replace("K", "")
-                amount = float(amount) * 1000
+                amount = int(float(amount)) * 1000
             else:
-                amount = float(amount)
+                amount = int(float(amount))
             amounts.append(amount)
 
         compensation = Compensation(
diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py
index 826f21d..d7bb363 100644
--- a/api/v1/jobs/__init__.py
+++ b/api/v1/jobs/__init__.py
@@ -39,9 +39,8 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)
 
-    with ThreadPoolExecutor() as executor:
+    with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
-
     scraper_response = CommonResponse(status="JSON response success", **results)
 
     if scraper_input.output_format == OutputFormat.CSV:
@@ -56,11 +55,13 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         csv_output = CSVFormatter.format(scraper_response)
         try:
             CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(status="Successfully uploaded to Google Sheets")
+            return CommonResponse(
+                status="Successfully uploaded to Google Sheets", **results
+            )
         except Exception as e:
             return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e)
+                status="Failed to upload to Google Sheet", error=repr(e), **results
             )
 
     else:
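
Note on the date_posted change in api/core/jobs/__init__.py: pydantic's `date` type accepts date objects, numeric timestamps, and ISO 8601 strings (YYYY-MM-DD), so a US-style string like "08-28-2023" fails validation. The patch handles this in the Indeed scraper by normalizing with strftime("%Y-%m-%d") before building the JobPost; a pre-validator on the model is another option. A minimal sketch of both, where `Post` is an illustrative stand-in for JobPost rather than the real class:

    from datetime import datetime, date
    from pydantic import BaseModel, validator


    class Post(BaseModel):  # illustrative stand-in for JobPost
        date_posted: date = None

        @validator("date_posted", pre=True)
        def accept_us_dates(cls, v):
            # Convert MM-DD-YYYY strings before pydantic's ISO parsing runs
            if isinstance(v, str):
                try:
                    return datetime.strptime(v, "%m-%d-%Y").date()
                except ValueError:
                    pass  # not US-style; let pydantic try ISO 8601
            return v


    print(Post(date_posted="08-28-2023").date_posted)  # 2023-08-28 via the validator
    print(Post(date_posted="2023-08-28").date_posted)  # 2023-08-28 via pydantic's ISO parsing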
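
The duplicate handling added to upload_to_google_sheet amounts to a set-membership filter on the Job URL column. A condensed sketch of the same flow, assuming the sheet's first row is a header and its columns mirror the new CSV layout with Job URL last (index 10); the function name and the sheet_name parameter are illustrative, not part of the patch:

    import csv
    from io import StringIO

    import gspread


    def append_unique_rows(credentials, csv_data: StringIO, sheet_name: str, url_col: int = 10) -> int:
        """Append CSV rows whose Job URL is not already in the sheet; return how many were added."""
        gc = gspread.authorize(credentials)
        worksheet = gc.open(sheet_name).get_worksheet(0)

        # URLs already present: the Job URL cell of every non-header row
        seen = {row[url_col] for row in worksheet.get_all_values()[1:] if len(row) > url_col}

        appended = 0
        for i, row in enumerate(csv.reader(StringIO(csv_data.getvalue()))):
            if i == 0 or row[url_col] in seen:
                continue  # skip the CSV header and known URLs
            worksheet.append_row(row)
            seen.add(row[url_col])  # also dedupe within this batch
            appended += 1
        return appended

Collecting the existing URLs once up front keeps the upload at one sheet read plus one append per new row, instead of re-reading the sheet for every row.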
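
For reference, the new get_date_posted logic reduces to a small parse chain: pull the posted_time query parameter out of the save-job button's data-href URL, then parse it as a UTC timestamp. The same chain in isolation, with a made-up example URL:

    from datetime import datetime, date
    from typing import Optional
    from urllib.parse import urlparse, parse_qs


    def posted_date_from_href(href: str) -> Optional[date]:
        # parse_qs maps each query key to a list of values
        params = parse_qs(urlparse(href).query)
        posted_time_str = params.get("posted_time", [None])[0]
        if not posted_time_str:
            return None
        return datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()


    # hypothetical save-job URL
    print(posted_date_from_href("https://example.com/jobs/save?posted_time=2023-08-28T12:00:00Z"))
    # -> 2023-08-28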