JobSpy/api/core/formatters/csv/__init__.py

import gspread
from oauth2client.service_account import ServiceAccountCredentials

import csv
from io import StringIO
from datetime import datetime

from ...jobs import *
from ...scrapers import *
from settings import *


class CSVFormatter:
    @staticmethod
    def fetch_job_urls(credentials: Any) -> set:
        """
        Fetches all the job urls from the google sheet to prevent duplicates
        :param credentials:
        :return: urls
        """
        try:
            gc = gspread.authorize(credentials)
            sh = gc.open(GSHEET_NAME)

            worksheet = sh.get_worksheet(0)
            data = worksheet.get_all_values()
            job_urls = set()
            for row in data[1:]:
                job_urls.add(row[3])
            return job_urls
        except Exception as e:
            raise e

    @staticmethod
    def upload_to_google_sheet(csv_data: str):
        """
        Appends rows to google sheet
        :param csv_data:
        :return:
        """
        try:
            scope = [
                "https://www.googleapis.com/auth/spreadsheets",
                "https://www.googleapis.com/auth/drive.file",
                "https://www.googleapis.com/auth/drive",
            ]
            credentials = ServiceAccountCredentials.from_json_keyfile_name(
                "client_secret.json", scope
            )
            gc = gspread.authorize(credentials)
            sh = gc.open(GSHEET_NAME)

            worksheet = sh.get_worksheet(0)
            data_string = csv_data.getvalue()
            reader = csv.reader(StringIO(data_string))

            job_urls = CSVFormatter.fetch_job_urls(credentials)

            rows = list(reader)

            for i, row in enumerate(rows):
                if i == 0:
                    continue
                if row[4] in job_urls:
                    continue

                row[6] = format(int(row[6]), ",d") if row[6] else ""
                row[7] = format(int(row[7]), ",d") if row[7] else ""
                worksheet.append_row(row)
        except Exception as e:
            raise e

    @staticmethod
    def generate_filename() -> str:
        """
        Adds a timestamp to the filename header
        :return: filename
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"JobSpy_results_{timestamp}.csv"

    @staticmethod
    def format(jobs: CommonResponse) -> StringIO:
        """
        Transfomr the jobs objects into csv
        :param jobs:
        :return: csv
        """
        output = StringIO()
        writer = csv.writer(output)

        headers = [
            "Title",
            "Company Name",
            "City",
            "State",
            "Job Type",
            "Pay Cycle",
            "Min Amount",
            "Max Amount",
            "Date Posted",
            "Description",
            "Job URL",
        ]
        writer.writerow(headers)

        for site, job_response in jobs.dict().items():
            if isinstance(job_response, dict) and job_response.get("success"):
                for job in job_response["jobs"]:
                    writer.writerow(
                        [
                            job["title"],
                            job["company_name"],
                            job["location"]["city"],
                            job["location"]["state"],
                            job["job_type"].value if job.get("job_type") else "",
                            job["compensation"]["interval"].value
                            if job["compensation"]
                            else "",
                            job["compensation"]["min_amount"]
                            if job["compensation"]
                            else "",
                            job["compensation"]["max_amount"]
                            if job["compensation"]
                            else "",
                            job.get("date_posted", ""),
                            job["description"],
                            job["job_url"],
                        ]
                    )

        output.seek(0)
        return output