remove duplicates - gsheets (#29)

pull/30/head^2
Cullen Watson 2023-08-31 10:29:43 -05:00 committed by GitHub
parent 9550886091
commit c4baa79181
6 changed files with 86 additions and 35 deletions

View File

@@ -11,8 +11,33 @@ from settings import *
 class CSVFormatter:
+    @staticmethod
+    def fetch_job_urls(credentials: Any) -> set:
+        """
+        Fetches all the job urls from the google sheet to prevent duplicates
+        :param credentials:
+        :return: urls
+        """
+        try:
+            gc = gspread.authorize(credentials)
+            sh = gc.open(GSHEET_NAME)
+            worksheet = sh.get_worksheet(0)
+            data = worksheet.get_all_values()
+            job_urls = set()
+            for row in data[1:]:
+                job_urls.add(row[3])
+            return job_urls
+        except Exception as e:
+            raise e
+
     @staticmethod
     def upload_to_google_sheet(csv_data: str):
+        """
+        Appends rows to google sheet
+        :param csv_data:
+        :return:
+        """
         try:
             scope = [
                 "https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
             data_string = csv_data.getvalue()
             reader = csv.reader(StringIO(data_string))
+            job_urls = CSVFormatter.fetch_job_urls(credentials)
             rows = list(reader)
             for i, row in enumerate(rows):
                 if i == 0:
                     continue
+                if row[4] in job_urls:
+                    continue
+                row[6] = format(int(row[6]), ",d") if row[6] else ""
+                row[7] = format(int(row[7]), ",d") if row[7] else ""
                 worksheet.append_row(row)
         except Exception as e:
             raise e
 
     @staticmethod
     def generate_filename() -> str:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        return f"JobSpy_results_{timestamp}.csv"
-
-    @staticmethod
-    def generate_filename() -> str:
+        """
+        Adds a timestamp to the filename header
+        :return: filename
+        """
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         return f"JobSpy_results_{timestamp}.csv"
@@ -59,20 +90,17 @@ class CSVFormatter:
         writer = csv.writer(output)
 
         headers = [
-            "Site",
             "Title",
             "Company Name",
-            "Job URL",
-            "Country",
             "City",
             "State",
             "Job Type",
-            "Compensation Interval",
+            "Pay Cycle",
             "Min Amount",
             "Max Amount",
-            "Currency",
             "Date Posted",
             "Description",
+            "Job URL",
         ]
         writer.writerow(headers)
@@ -81,11 +109,8 @@ class CSVFormatter:
         for job in job_response["jobs"]:
             writer.writerow(
                 [
-                    site,
                     job["title"],
                     job["company_name"],
-                    job["job_url"],
-                    job["location"]["country"],
                     job["location"]["city"],
                     job["location"]["state"],
                     job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
                     job["compensation"]["max_amount"]
                     if job["compensation"]
                     else "",
-                    job["compensation"]["currency"]
-                    if job["compensation"]
-                    else "",
                     job.get("date_posted", ""),
                     job["description"],
+                    job["job_url"],
                 ]
             )

View File

@@ -1,5 +1,5 @@
 from typing import Union
-from datetime import datetime
+from datetime import date
 from enum import Enum
 from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):
 class Compensation(BaseModel):
     interval: CompensationInterval
-    min_amount: float
-    max_amount: float
-    currency: str = "USA"
+    min_amount: int
+    max_amount: int
+    currency: str = "USD"
 
 
 class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
-    date_posted: datetime = None
+    # why is 08-28-2023 a validation error for type date? how do I fix this?
+    date_posted: date = None
 
 
 class JobResponse(BaseModel):
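
On the in-code question above (why "08-28-2023" fails validation for type date): pydantic v1 parses a date field from an ISO-8601 string (YYYY-MM-DD), a date object, or a numeric timestamp; MM-DD-YYYY is none of those. A hedged sketch, with Post as an illustrative stand-in for JobPost:

from datetime import date, datetime
from pydantic import BaseModel

class Post(BaseModel):  # illustrative stand-in for the real JobPost
    date_posted: date = None

Post(date_posted="2023-08-28")       # OK: ISO-8601 string
Post(date_posted=date(2023, 8, 28))  # OK: already a date object
# Post(date_posted="08-28-2023")     # ValidationError: not ISO-8601
# Fix: pre-parse before constructing the model:
Post(date_posted=datetime.strptime("08-28-2023", "%m-%d-%Y").date())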

View File

@@ -1,6 +1,7 @@
 import re
 import json
 from typing import Optional, Tuple, List
+from datetime import datetime
 
 import tls_client
 import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from concurrent.futures import ThreadPoolExecutor, Future
 import math
+import traceback
+import sys
 
 
 class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
             raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
+        if "did not match any jobs" in str(soup):
+            raise ParsingException("Search did not match any jobs")
 
         jobs = IndeedScraper.parse_jobs(
             soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
         def process_job(job) -> Optional[JobPost]:
             job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
+            job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
             if job_url in self.seen_urls:
                 return None
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
             if interval in CompensationInterval.__members__:
                 compensation = Compensation(
                     interval=CompensationInterval[interval],
-                    min_amount=extracted_salary.get("max"),
-                    max_amount=extracted_salary.get("min"),
+                    min_amount=int(extracted_salary.get("max")),
+                    max_amount=int(extracted_salary.get("min")),
                     currency=currency,
                 )
 
             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
+            date_posted = date_posted.strftime("%Y-%m-%d")
 
             description = self.get_description(job_url, session)
             li_elements = snippet_html.find_all("li")
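
Indeed's pubDate is epoch milliseconds; the two lines above convert it to a datetime and then to a YYYY-MM-DD string, which the now date-typed JobPost.date_posted field re-parses. A small sketch with an illustrative value:

from datetime import datetime

pub_date_ms = 1693238400000  # illustrative epoch-milliseconds value
date_posted = datetime.fromtimestamp(pub_date_ms / 1000).strftime("%Y-%m-%d")
# '2023-08-28' in most zones; note fromtimestamp() uses local time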
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
                 job_type=job_type,
                 compensation=compensation,
                 date_posted=date_posted,
-                job_url=job_url,
+                job_url=job_url_client,
             )
             return job_post
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
                 jobs, _ = future.result()
                 job_list += jobs
             except StatusException as e:
                 return JobResponse(
                     success=False,
                     error=f"Indeed returned status code {e.status_code}",
                 )
             except ParsingException as e:
                 return JobResponse(
                     success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
         :return: script_tag
         """
         script_tags = soup.find_all("script")
+
         for tag in script_tags:
             if (
                 tag.string

View File

@@ -1,4 +1,5 @@
 from typing import Optional
+from datetime import datetime
 import requests
 from bs4 import BeautifulSoup

View File

@ -1,4 +1,6 @@
import math
import json import json
from datetime import datetime
from typing import Optional, Tuple, List from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import * from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
                     success=False,
                     error=f"ZipRecruiter returned status code {e.status_code}",
                 )
+            except Exception as e:
+                return JobResponse(
+                    success=False,
+                    error=f"ZipRecruiter failed to scrape: {e}",
+                )
 
         #: note: this does not handle if the results are more or less than the results_wanted
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
         return CompensationInterval(interval_str)
 
     @staticmethod
-    def get_date_posted(job: BeautifulSoup) -> Optional[str]:
+    def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
         """
         Extracts the date a job was posted
         :param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
         button = job.find(
             "button", {"class": "action_input save_job zrs_btn_secondary_200"}
         )
-        url_time = button["data-href"]
+        if not button:
+            return None
+
+        url_time = button.get("data-href", "")
         url_components = urlparse(url_time)
         params = parse_qs(url_components.query)
-        return params.get("posted_time", [None])[0]
+        posted_time_str = params.get("posted_time", [None])[0]
+
+        if posted_time_str:
+            posted_date = datetime.strptime(
+                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
+            ).date()
+            return posted_date
+
+        return None
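
The rewritten get_date_posted guards against a missing save button and parses the posted_time query parameter into a real date. A condensed sketch of the extraction, assuming the same data-href shape (the href value here is illustrative):

from datetime import datetime
from urllib.parse import urlparse, parse_qs

href = "/jobs/save?posted_time=2023-08-28T14:05:00Z"  # illustrative data-href
posted = parse_qs(urlparse(href).query).get("posted_time", [None])[0]
if posted:
    print(datetime.strptime(posted, "%Y-%m-%dT%H:%M:%SZ").date())  # 2023-08-28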
     @staticmethod
     def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
             amount = amount.replace(",", "").strip("$ ").split(" ")[0]
             if "K" in amount:
                 amount = amount.replace("K", "")
-                amount = float(amount) * 1000
+                amount = int(float(amount)) * 1000
             else:
-                amount = float(amount)
+                amount = int(float(amount))
             amounts.append(amount)
 
         compensation = Compensation(
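
Note the operator order in the new int() coercion: int(float(amount)) * 1000 truncates before multiplying, so a fractional figure like "65.5K" becomes 65000 rather than 65500 (int(float(amount) * 1000) would keep it). A worked sketch of the parsing path:

def parse_amount(raw: str) -> int:
    # mirrors the diff: drop commas, strip "$", keep the first token
    amount = raw.replace(",", "").strip("$ ").split(" ")[0]
    if "K" in amount:
        return int(float(amount.replace("K", ""))) * 1000
    return int(float(amount))

assert parse_amount("$65K") == 65000
assert parse_amount("$62,400 a year") == 62400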

View File

@@ -39,9 +39,8 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)
 
-    with ThreadPoolExecutor() as executor:
+    with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
 
     scraper_response = CommonResponse(status="JSON response success", **results)
 
     if scraper_input.output_format == OutputFormat.CSV:
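
Capping the pool at max_workers=3 presumably matches one thread per supported site; executor.map preserves input order and dict() pairs each site value with its JobResponse. A hedged sketch of the fan-out pattern (site names and return values illustrative):

from concurrent.futures import ThreadPoolExecutor

def scrape_site(site):  # stand-in for the closure above
    return site, f"scraped {site}"  # real code returns (site.value, JobResponse)

sites = ["indeed", "linkedin", "zip_recruiter"]  # illustrative
with ThreadPoolExecutor(max_workers=3) as executor:
    results = dict(executor.map(scrape_site, sites))
# {'indeed': 'scraped indeed', 'linkedin': 'scraped linkedin', 'zip_recruiter': 'scraped zip_recruiter'}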
@@ -56,11 +55,13 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         csv_output = CSVFormatter.format(scraper_response)
         try:
             CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(status="Successfully uploaded to Google Sheets")
+            return CommonResponse(
+                status="Successfully uploaded to Google Sheets", **results
+            )
         except Exception as e:
             return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e)
+                status="Failed to upload to Google Sheet", error=repr(e), **results
             )
     else:
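
Spreading **results into each CommonResponse keeps the per-site JobResponse payloads in the reply even on the Google-Sheets code path, instead of returning a bare status line. A sketch assuming CommonResponse declares the site names as optional fields (the field names and shapes here are assumptions, not the real model):

from typing import Any, Optional
from pydantic import BaseModel

class CommonResponse(BaseModel):  # illustrative shape only
    status: str
    error: Optional[str] = None
    indeed: Optional[Any] = None
    linkedin: Optional[Any] = None
    zip_recruiter: Optional[Any] = None

results = {"indeed": {"success": True, "jobs": []}}  # illustrative
CommonResponse(status="Successfully uploaded to Google Sheets", **results)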