remove duplicates - gsheets (#29)

pull/30/head^2
Cullen Watson 2023-08-31 10:29:43 -05:00 committed by GitHub
parent 9550886091
commit c4baa79181
6 changed files with 86 additions and 35 deletions


@@ -11,8 +11,33 @@ from settings import *
class CSVFormatter:
@staticmethod
def fetch_job_urls(credentials: Any) -> set:
"""
Fetches all job URLs from the Google Sheet to prevent duplicates
:param credentials: Google API credentials used by gspread
:return: set of job URLs
"""
try:
gc = gspread.authorize(credentials)
sh = gc.open(GSHEET_NAME)
worksheet = sh.get_worksheet(0)
data = worksheet.get_all_values()
job_urls = set()
for row in data[1:]:
job_urls.add(row[3])
return job_urls
except Exception as e:
raise e
@staticmethod
def upload_to_google_sheet(csv_data: str):
"""
Appends new rows to the Google Sheet, skipping jobs whose URL is already present
:param csv_data: CSV output buffer (io.StringIO)
:return:
"""
try:
scope = [
"https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
data_string = csv_data.getvalue()
reader = csv.reader(StringIO(data_string))
job_urls = CSVFormatter.fetch_job_urls(credentials)
rows = list(reader)
for i, row in enumerate(rows):
if i == 0:
continue
if row[4] in job_urls:
continue
row[6] = format(int(row[6]), ",d") if row[6] else ""
row[7] = format(int(row[7]), ",d") if row[7] else ""
worksheet.append_row(row)
except Exception as e:
raise e
@staticmethod
def generate_filename() -> str:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
@staticmethod
def generate_filename() -> str:
"""
Adds a timestamp to the filename header
:return: filename
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"
@@ -59,20 +90,17 @@ class CSVFormatter:
writer = csv.writer(output)
headers = [
"Site",
"Title",
"Company Name",
"Job URL",
"Country",
"City",
"State",
"Job Type",
"Compensation Interval",
"Pay Cycle",
"Min Amount",
"Max Amount",
"Currency",
"Date Posted",
"Description",
"Job URL",
]
writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
for job in job_response["jobs"]:
writer.writerow(
[
site,
job["title"],
job["company_name"],
job["job_url"],
job["location"]["country"],
job["location"]["city"],
job["location"]["state"],
job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
job["compensation"]["max_amount"]
if job["compensation"]
else "",
job["compensation"]["currency"]
if job["compensation"]
else "",
job.get("date_posted", ""),
job["description"],
job["job_url"],
]
)

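Taken together, the two methods above are the dedup flow this commit adds: fetch the job URLs already in the sheet, then append only rows whose URL is new. A minimal usage sketch of fetch_job_urls, assuming a gspread service-account setup; the oauth2client import and the key filename are illustrative and not shown in this diff:

from oauth2client.service_account import ServiceAccountCredentials

scope = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive",  # Drive scope so gspread can open the sheet by name
]
# "client_secret.json" is a placeholder path for the service-account key.
credentials = ServiceAccountCredentials.from_json_keyfile_name("client_secret.json", scope)

existing_urls = CSVFormatter.fetch_job_urls(credentials)
print("https://www.indeed.com/viewjob?jk=abc123" in existing_urls)

upload_to_google_sheet builds its own credentials, calls fetch_job_urls, and skips any CSV row whose job URL is already in that set. Note that despite the csv_data: str annotation it calls csv_data.getvalue(), so the caller is expected to pass the io.StringIO produced by CSVFormatter.format.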

@@ -1,5 +1,5 @@
from typing import Union
from datetime import datetime
from datetime import date
from enum import Enum
from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):
class Compensation(BaseModel):
interval: CompensationInterval
min_amount: float
max_amount: float
currency: str = "USA"
min_amount: int
max_amount: int
currency: str = "USD"
class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
description: str = None
job_type: JobType = None
compensation: Compensation = None
date_posted: datetime = None
# why is 08-28-2023 a validation error for type date? how do I fix this?
date_posted: date = None
class JobResponse(BaseModel):

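On the question in the comment above: pydantic will coerce an ISO-style string like "2023-08-28" into a date, but the US-style "08-28-2023" fails validation. One possible fix, sketched below, is a pre-validator that parses that format first; this validator is an illustration, not part of the commit, and only the date_posted field is reproduced:

from datetime import date, datetime
from pydantic import BaseModel, validator

class JobPost(BaseModel):
    date_posted: date = None

    @validator("date_posted", pre=True)
    def parse_us_date(cls, value):
        # Accept "MM-DD-YYYY" strings; anything else falls through to pydantic's normal parsing.
        if isinstance(value, str):
            try:
                return datetime.strptime(value, "%m-%d-%Y").date()
            except ValueError:
                pass
        return value

print(JobPost(date_posted="08-28-2023").date_posted)  # 2023-08-28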

@@ -1,6 +1,7 @@
import re
import json
from typing import Optional, Tuple, List
from datetime import datetime
import tls_client
import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
import math
import traceback
import sys
class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
raise StatusException(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in str(soup):
raise ParsingException("Search did not match any jobs")
jobs = IndeedScraper.parse_jobs(
soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
def process_job(job) -> Optional[JobPost]:
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
if interval in CompensationInterval.__members__:
compensation = Compensation(
interval=CompensationInterval[interval],
min_amount=extracted_salary.get("max"),
max_amount=extracted_salary.get("min"),
min_amount=int(extracted_salary.get("min")),
max_amount=int(extracted_salary.get("max")),
currency=currency,
)
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session)
li_elements = snippet_html.find_all("li")
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
job_type=job_type,
compensation=compensation,
date_posted=date_posted,
job_url=job_url,
job_url=job_url_client,
)
return job_post
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
jobs, _ = future.result()
job_list += jobs
except StatusException as e:
return JobResponse(
success=False,
error=f"Indeed returned status code {e.status_code}",
)
except ParsingException as e:
return JobResponse(
success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
:return: script_tag
"""
script_tags = soup.find_all("script")
for tag in script_tags:
if (
tag.string

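The new date handling above converts Indeed's millisecond pubDate epoch into a "%Y-%m-%d" string, which the date-typed date_posted field can then parse. A standalone check of that conversion, with a made-up timestamp:

from datetime import datetime

pub_date_ms = 1693180800000  # hypothetical job["pubDate"] value, in milliseconds
timestamp_seconds = pub_date_ms / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
print(date_posted)  # "2023-08-28" or "2023-08-27", depending on the local timezone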

@@ -1,4 +1,5 @@
from typing import Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup


@@ -1,4 +1,6 @@
import math
import json
from datetime import datetime
from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs
@@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
success=False,
error=f"ZipRecruiter returned status code {e.status_code}",
)
except Exception as e:
return JobResponse(
success=False,
error=f"ZipRecruiter failed to scrape: {e}",
)
# note: this does not handle the case where the results returned are more or fewer than results_wanted
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: BeautifulSoup) -> Optional[str]:
def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
url_time = button["data-href"]
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
return params.get("posted_time", [None])[0]
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
@staticmethod
def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = float(amount) * 1000
amount = int(float(amount)) * 1000
else:
amount = float(amount)
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(

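The reworked get_date_posted above guards against a missing save-job button and converts the posted_time query parameter into a real date instead of returning the raw string. A standalone sketch of that parse, with a made-up data-href value:

from datetime import datetime
from urllib.parse import urlparse, parse_qs

url_time = "/jobs/save?posted_time=2023-08-28T14%3A30%3A00Z"  # hypothetical data-href
params = parse_qs(urlparse(url_time).query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
    posted_date = datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()
    print(posted_date)  # 2023-08-28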

@@ -39,9 +39,8 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
scraped_data: JobResponse = scraper.scrape(scraper_input)
return (site.value, scraped_data)
with ThreadPoolExecutor() as executor:
with ThreadPoolExecutor(max_workers=3) as executor:
results = dict(executor.map(scrape_site, scraper_input.site_type))
scraper_response = CommonResponse(status="JSON response success", **results)
if scraper_input.output_format == OutputFormat.CSV:
@@ -56,11 +55,13 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
csv_output = CSVFormatter.format(scraper_response)
try:
CSVFormatter.upload_to_google_sheet(csv_output)
return CommonResponse(status="Successfully uploaded to Google Sheets")
return CommonResponse(
status="Successfully uploaded to Google Sheets", **results
)
except Exception as e:
return CommonResponse(
status="Failed to upload to Google Sheet", error=repr(e)
status="Failed to upload to Google Sheet", error=repr(e), **results
)
else:
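The executor change above caps the pool at three workers, one per supported site, and builds the results dict directly from executor.map. A minimal sketch of that pattern with a stand-in scrape function; the site names and return shape are illustrative, not the real scrapers:

from concurrent.futures import ThreadPoolExecutor

def scrape_site(site: str) -> tuple:
    # Stand-in for calling scraper.scrape(scraper_input) and returning (site, response).
    return site, {"success": True, "jobs": []}

sites = ["indeed", "linkedin", "zip_recruiter"]
with ThreadPoolExecutor(max_workers=3) as executor:
    results = dict(executor.map(scrape_site, sites))

print(sorted(results))  # ['indeed', 'linkedin', 'zip_recruiter']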