JobSpy/api/core/formatters/csv/__init__.py

import gspread
from oauth2client.service_account import ServiceAccountCredentials

import csv
from io import StringIO
from datetime import datetime

from ...jobs import *
from ...scrapers import *
from settings import *


class CSVFormatter:
    """
    Converts scraped job results to CSV and appends them to a Google Sheet.

    Rows appended to the sheet are the CSV rows produced by ``format``, so
    the column constants below index into the header list written there.
    """

    # Zero-based column positions in the header row written by ``format``.
    MIN_AMOUNT_COL = 6
    MAX_AMOUNT_COL = 7
    JOB_URL_COL = 10

    @staticmethod
    def fetch_job_urls(credentials: Any) -> set:
        """
        Fetches all the job urls from the google sheet to prevent duplicates
        :param credentials: credentials object passed to gspread.authorize
        :return: set of the non-empty values found in the "Job URL" column
        """
        gc = gspread.authorize(credentials)
        sh = gc.open(GSHEET_NAME)

        worksheet = sh.get_worksheet(0)
        data = worksheet.get_all_values()

        url_col = CSVFormatter.JOB_URL_COL
        # data[0] is treated as the header row. Rows too short to have a URL
        # column, or with an empty URL cell, contribute nothing.
        return {
            row[url_col]
            for row in data[1:]
            if len(row) > url_col and row[url_col]
        }

    @staticmethod
    def _format_amount(raw: str) -> str:
        """
        Formats a CSV amount cell with thousands separators
        :param raw: cell text, e.g. "100000", "85000.0" or ""
        :return: e.g. "100,000"; "" when the cell is empty
        """
        if not raw:
            return ""
        try:
            amount = int(raw)
        except ValueError:
            # Tolerate float-looking text such as "85000.0".
            amount = int(float(raw))
        # Inside a method body ``format`` is the builtin, not CSVFormatter.format.
        return format(amount, ",d")

    @staticmethod
    def _select_new_rows(rows: list, known_urls: set) -> list:
        """
        Picks the CSV rows whose job url has not been seen yet
        :param rows: parsed CSV rows, the first one being the header
        :param known_urls: job urls already present in the sheet (not modified)
        :return: new rows (copies) with the min/max amount cells formatted
        """
        url_col = CSVFormatter.JOB_URL_COL
        min_col = CSVFormatter.MIN_AMOUNT_COL
        max_col = CSVFormatter.MAX_AMOUNT_COL

        # Work on a copy so duplicates inside this batch are caught too,
        # without mutating the caller's set.
        seen = set(known_urls)
        fresh = []
        for row in rows[1:]:
            if len(row) <= url_col:
                # Blank or truncated line: there is no URL to compare.
                continue
            url = row[url_col]
            if url:
                if url in seen:
                    continue
                seen.add(url)

            new_row = list(row)
            new_row[min_col] = CSVFormatter._format_amount(row[min_col])
            new_row[max_col] = CSVFormatter._format_amount(row[max_col])
            fresh.append(new_row)
        return fresh

    @staticmethod
    def upload_to_google_sheet(csv_data: StringIO):
        """
        Appends rows to google sheet, skipping jobs whose url is already there
        :param csv_data: in-memory CSV as returned by ``format``
        :return:
        """
        scope = [
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive.file",
            "https://www.googleapis.com/auth/drive",
        ]
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            "client_secret.json", scope
        )
        gc = gspread.authorize(credentials)
        sh = gc.open(GSHEET_NAME)
        worksheet = sh.get_worksheet(0)

        # getvalue() returns the whole buffer regardless of stream position.
        rows = list(csv.reader(StringIO(csv_data.getvalue())))

        job_urls = CSVFormatter.fetch_job_urls(credentials)
        new_rows = CSVFormatter._select_new_rows(rows, job_urls)

        # One batched request instead of one API call per row.
        if new_rows:
            worksheet.append_rows(new_rows)

    @staticmethod
    def generate_filename() -> str:
        """
        Adds a timestamp to the filename header
        :return: filename
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"JobSpy_results_{timestamp}.csv"

    @staticmethod
    def format(jobs: "CommonResponse") -> StringIO:
        """
        Transform the jobs objects into csv
        :param jobs: response object whose ``dict()`` yields per-site results
        :return: csv as a StringIO rewound to the start
        """
        output = StringIO()
        writer = csv.writer(output)

        headers = [
            "Title",
            "Company Name",
            "City",
            "State",
            "Job Type",
            "Pay Cycle",
            "Min Amount",
            "Max Amount",
            "Date Posted",
            "Description",
            "Job URL",
        ]
        writer.writerow(headers)

        for job_response in jobs.dict().values():
            # Only dict entries flagged as successful carry a "jobs" list.
            if not (isinstance(job_response, dict) and job_response.get("success")):
                continue

            for job in job_response["jobs"]:
                compensation = job["compensation"]
                writer.writerow(
                    [
                        job["title"],
                        job["company_name"],
                        job["location"]["city"],
                        job["location"]["state"],
                        job["job_type"].value if job.get("job_type") else "",
                        compensation["interval"].value if compensation else "",
                        compensation["min_amount"] if compensation else "",
                        compensation["max_amount"] if compensation else "",
                        job.get("date_posted", ""),
                        job["description"],
                        job["job_url"],
                    ]
                )

        output.seek(0)
        return output
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`import gspread`
			`from oauth2client.service_account import ServiceAccountCredentials`

Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`import csv`
			`from io import StringIO`
			`from datetime import datetime`

			`from ...jobs import *`
			`from ...scrapers import *`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`from settings import *`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00

Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`class CSVFormatter:`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`@staticmethod`
			`def fetch_job_urls(credentials: Any) -> set:`
			`"""`
			`Fetches all the job urls from the google sheet to prevent duplicates`
			`:param credentials:`
			`:return: urls`
			`"""`
			`try:`
			`gc = gspread.authorize(credentials)`
			`sh = gc.open(GSHEET_NAME)`

			`worksheet = sh.get_worksheet(0)`
			`data = worksheet.get_all_values()`
			`job_urls = set()`
			`for row in data[1:]:`
			`job_urls.add(row[3])`
			`return job_urls`
			`except Exception as e:`
			`raise e`

Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`@staticmethod`
			`def upload_to_google_sheet(csv_data: str):`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`"""`
			`Appends rows to google sheet`
			`:param csv_data:`
			`:return:`
			`"""`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`try:`
			`scope = [`
			`"https://www.googleapis.com/auth/spreadsheets",`
			`"https://www.googleapis.com/auth/drive.file",`
			`"https://www.googleapis.com/auth/drive",`
			`]`
			`credentials = ServiceAccountCredentials.from_json_keyfile_name(`
Docker workflow (#24) 2023-08-28 10:15:13 -07:00			`"client_secret.json", scope`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`)`
			`gc = gspread.authorize(credentials)`
			`sh = gc.open(GSHEET_NAME)`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`worksheet = sh.get_worksheet(0)`
			`data_string = csv_data.getvalue()`
			`reader = csv.reader(StringIO(data_string))`

remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`job_urls = CSVFormatter.fetch_job_urls(credentials)`

Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`rows = list(reader)`

			`for i, row in enumerate(rows):`
			`if i == 0:`
			`continue`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`if row[4] in job_urls:`
			`continue`

			`row[6] = format(int(row[6]), ",d") if row[6] else ""`
			`row[7] = format(int(row[7]), ",d") if row[7] else ""`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`worksheet.append_row(row)`
			`except Exception as e:`
			`raise e`

			`@staticmethod`
			`def generate_filename() -> str:`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`"""`
			`Adds a timestamp to the filename header`
			`:return: filename`
			`"""`
Docker workflow (#24) 2023-08-28 10:15:13 -07:00			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`return f"JobSpy_results_{timestamp}.csv"`

Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`@staticmethod`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`def format(jobs: CommonResponse) -> StringIO:`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`"""`
			`Transfomr the jobs objects into csv`
			`:param jobs:`
			`:return: csv`
			`"""`
			`output = StringIO()`
			`writer = csv.writer(output)`

			`headers = [`
			`"Title",`
			`"Company Name",`
			`"City",`
			`"State",`
			`"Job Type",`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`"Pay Cycle",`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`"Min Amount",`
			`"Max Amount",`
			`"Date Posted",`
			`"Description",`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`"Job URL",`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`]`
			`writer.writerow(headers)`

			`for site, job_response in jobs.dict().items():`
Google sheets integration (#22) 2023-08-27 18:32:46 -07:00			`if isinstance(job_response, dict) and job_response.get("success"):`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`for job in job_response["jobs"]:`
			`writer.writerow(`
			`[`
			`job["title"],`
			`job["company_name"],`
			`job["location"]["city"],`
			`job["location"]["state"],`
			`job["job_type"].value if job.get("job_type") else "",`
			`job["compensation"]["interval"].value`
			`if job["compensation"]`
			`else "",`
			`job["compensation"]["min_amount"]`
			`if job["compensation"]`
			`else "",`
			`job["compensation"]["max_amount"]`
			`if job["compensation"]`
			`else "",`
			`job.get("date_posted", ""),`
			`job["description"],`
remove duplicates - gsheets (#29) 2023-08-31 08:29:43 -07:00			`job["job_url"],`
Add Csv output (#20) 2023-08-27 14:25:48 -07:00			`]`
			`)`

			`output.seek(0)`
			`return output`