mirror of https://github.com/Bunsly/JobSpy

Add Csv output (#20)

parent 32a5bb37cd
commit 80a02faa75
README.md

@@ -13,17 +13,19 @@ POST `/api/v1/jobs/`
 ### Request Schema
 
 ```plaintext
-Request
-├── Required
-│ ├── site_type (List[enum]): linkedin, zip_recruiter, indeed
-│ └── search_term (str)
-└── Optional
+{
+Required
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+└── search_term (str)
+Optional
 ├── location (int)
 ├── distance (int)
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── is_remote (bool)
 ├── results_wanted (int): per site_type
-└── easy_apply (bool): only for linkedin
+├── easy_apply (bool): only for linkedin
+└── output_format (enum): json, csv
+}
 ```
 
 ### Request Example
@@ -40,32 +42,37 @@ Request
 
 ### Response Schema
 ```plaintext
-site_type (enum)
-└── response (SiteResponse)
-├── success (bool)
-├── error (str)
-├── jobs (List[JobPost])
-│ └── JobPost
-│ ├── title (str)
-│ ├── company_name (str)
-│ ├── job_url (str)
-│ ├── location (object)
-│ │ ├── country (str)
-│ │ ├── city (str)
-│ │ ├── state (str)
-│ ├── description (str)
-│ ├── job_type (enum)
-│ ├── compensation (object)
-│ │ ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
-│ │ ├── min_amount (float)
-│ │ ├── max_amount (float)
-│ │ └── currency (str): default is "US"
-│ └── date_posted (datetime)
-├── total_results (int)
-└── returned_results (int)
+{
+site_type (enum): {
+JobResponse
+├── success (bool)
+├── error (str)
+├── jobs (List[JobPost])
+│ └── JobPost
+│ ├── title (str)
+│ ├── company_name (str)
+│ ├── job_url (str)
+│ ├── location (object)
+│ │ ├── country (str)
+│ │ ├── city (str)
+│ │ ├── state (str)
+│ ├── description (str)
+│ ├── job_type (enum)
+│ ├── compensation (object)
+│ │ ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
+│ │ ├── min_amount (float)
+│ │ ├── max_amount (float)
+│ │ └── currency (str): default is "US"
+│ └── date_posted (datetime)
+│
+├── total_results (int)
+└── returned_results (int)
+}, ...
+}
+
 ```
 
-### Response Example
+### Response Example (JSON)
 ```json
 {
   "indeed": {
@@ -119,6 +126,12 @@ site_type (enum)
 }
 }
 ```
+### Response Example (CSV)
+```
+Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
+indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 139480.0, 209760.0, USD, 2023-08-18T00:00:00, Job Description Designs...
+linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
+```
 
 ## Installation
 _Python >= 3.10 required_
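To exercise the new `output_format` end to end, a minimal client sketch (assumptions: the API is served locally on port 8000 and the `requests` package is installed; the site values and search term are illustrative):

```python
import requests

# Ask the jobs endpoint for CSV; output_format defaults to "json" when omitted.
resp = requests.post(
    "http://localhost:8000/api/v1/jobs/",
    json={
        "site_type": ["indeed", "linkedin"],
        "search_term": "software engineer",
        "output_format": "csv",
    },
)
resp.raise_for_status()

# The route streams the CSV back with a Content-Disposition attachment
# header, so the body can be written straight to disk.
with open("jobspy_results.csv", "w", encoding="utf-8") as f:
    f.write(resp.text)
```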
api/core/formatters/__init__.py (new file; path inferred from the `OutputFormat` imports below)

@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class OutputFormat(Enum):
+    CSV = "csv"
+    JSON = "json"
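Because `ScraperInput` (below) declares the field as `output_format: OutputFormat`, Pydantic coerces the raw string from the request body into this enum. A standalone sketch of that coercion, restating the enum locally so the snippet runs by itself:

```python
from enum import Enum


class OutputFormat(Enum):
    CSV = "csv"
    JSON = "json"


# Pydantic performs the equivalent of this by-value lookup when validating
# the body; an unknown value such as "xml" raises a validation error.
assert OutputFormat("csv") is OutputFormat.CSV
assert OutputFormat.JSON.value == "json"
```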
api/core/formatters/csv.py (new file; path inferred from the `CSVFormatter` import below)

@@ -0,0 +1,74 @@
+import csv
+from io import StringIO
+from datetime import datetime
+
+from ...jobs import *
+from ...scrapers import *
+
+
+def generate_filename() -> str:
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return f"JobSpy_results_{timestamp}.csv"
+
+
+class CSVFormatter:
+    @staticmethod
+    def format(jobs: ScraperResponse) -> StringIO:
+        """
+        Transform the job objects into CSV
+        :param jobs:
+        :return: csv
+        """
+        output = StringIO()
+        writer = csv.writer(output)
+
+        headers = [
+            "Site",
+            "Title",
+            "Company Name",
+            "Job URL",
+            "Country",
+            "City",
+            "State",
+            "Job Type",
+            "Compensation Interval",
+            "Min Amount",
+            "Max Amount",
+            "Currency",
+            "Date Posted",
+            "Description",
+        ]
+        writer.writerow(headers)
+
+        for site, job_response in jobs.dict().items():
+            if job_response and job_response.get("success"):
+                for job in job_response["jobs"]:
+                    writer.writerow(
+                        [
+                            site,
+                            job["title"],
+                            job["company_name"],
+                            job["job_url"],
+                            job["location"]["country"],
+                            job["location"]["city"],
+                            job["location"]["state"],
+                            job["job_type"].value if job.get("job_type") else "",
+                            job["compensation"]["interval"].value
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["min_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["max_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["currency"]
+                            if job["compensation"]
+                            else "",
+                            job.get("date_posted", ""),
+                            job["description"],
+                        ]
+                    )
+
+        output.seek(0)
+        return output
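A minimal usage sketch for the formatter (assumptions: run from the repo root so the `api` package is importable; an empty `ScraperResponse` is used so no scraping is required, leaving only the header row in the buffer):

```python
from api.core.scrapers import ScraperResponse
from api.core.formatters.csv import CSVFormatter, generate_filename

# With every site response left as None, format() skips the job loop and
# the rewound StringIO holds just the header row.
empty = ScraperResponse(linkedin=None, indeed=None, zip_recruiter=None)
buffer = CSVFormatter.format(empty)

with open(generate_filename(), "w", newline="", encoding="utf-8") as f:
    f.write(buffer.getvalue())
```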
api/core/scrapers/__init__.py

@@ -1,5 +1,6 @@
 from ..jobs import *
-from typing import List
+from ..formatters import OutputFormat
+from typing import List, Dict, Optional
 
 
 class StatusException(Exception):

@@ -16,6 +17,7 @@ class Site(Enum):
 class ScraperInput(BaseModel):
     site_type: List[Site]
     search_term: str
+    output_format: OutputFormat = OutputFormat.JSON
 
     location: str = None
     distance: int = None

@@ -26,6 +28,12 @@ class ScraperInput(BaseModel):
     results_wanted: int = 15
 
 
+class ScraperResponse(BaseModel):
+    linkedin: Optional[JobResponse]
+    indeed: Optional[JobResponse]
+    zip_recruiter: Optional[JobResponse]
+
+
 class Scraper:
     def __init__(self, site: Site, url: str):
         self.site = site
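Since the new field has a default, `output_format` stays optional in the request body. A small sketch of that behavior (assumes the `api` package is importable; Pydantic coerces the `"indeed"` string into the `Site` enum by value):

```python
from api.core.scrapers import ScraperInput
from api.core.formatters import OutputFormat

# output_format falls back to JSON when the field is omitted.
params = ScraperInput(site_type=["indeed"], search_term="software engineer")
assert params.output_format is OutputFormat.JSON
```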
api/core/scrapers/ziprecruiter/__init__.py (path inferred from imports)

@@ -96,7 +96,9 @@ class ZipRecruiterScraper(Scraper):
         title = job.find("h2", {"class": "title"}).text
         company = job.find("a", {"class": "company_name"}).text.strip()
 
-        description, updated_job_url = ZipRecruiterScraper.get_description(job_url, session)
+        description, updated_job_url = ZipRecruiterScraper.get_description(
+            job_url, session
+        )
         if updated_job_url is not None:
             job_url = updated_job_url
         if description is None:
api/v1/jobs/__init__.py (path inferred from the route prefix)

@@ -1,11 +1,20 @@
-from concurrent.futures import ThreadPoolExecutor
+import io
 from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from concurrent.futures import ThreadPoolExecutor
 
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
 from api.core.scrapers.linkedin import LinkedInScraper
-from api.core.scrapers import ScraperInput, Site, JobResponse
-from typing import List, Dict, Tuple
+from api.core.formatters.csv import CSVFormatter, generate_filename
+from api.core.scrapers import (
+    ScraperInput,
+    Site,
+    JobResponse,
+    OutputFormat,
+    ScraperResponse,
+)
+from typing import List, Dict, Tuple, Union
 
 router = APIRouter(prefix="/jobs", tags=["jobs"])
 

@@ -17,23 +26,31 @@ SCRAPER_MAPPING = {
 
 
 @router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> Dict[str, JobResponse]:
+async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
-    :return: Dict[str, JobResponse]: where each key is a site
+    :return: scraper_response
     """
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
-        scraped_data = scraper.scrape(scraper_input)
+        scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)
 
     with ThreadPoolExecutor() as executor:
-        resp_dict = {
-            site: resp
-            for site, resp in executor.map(scrape_site, scraper_input.site_type)
-        }
+        results = dict(executor.map(scrape_site, scraper_input.site_type))
 
-    return resp_dict
+    scraper_response = ScraperResponse(**results)
+
+    print(scraper_input.output_format)
+    if scraper_input.output_format == OutputFormat.CSV:
+        csv_output = CSVFormatter.format(scraper_response)
+        response = StreamingResponse(csv_output, media_type="text/csv")
+        response.headers[
+            "Content-Disposition"
+        ] = f"attachment; filename={generate_filename()}"
+        return response
+
+    return scraper_response
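A test-style sketch of the CSV branch (assumptions: `app` is the FastAPI instance exported by `main.py` and `fastapi.testclient`'s dependencies are installed; note this calls the real scrapers, so it needs network access):

```python
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)
resp = client.post(
    "/api/v1/jobs/",
    json={
        "site_type": ["indeed"],
        "search_term": "software engineer",
        "output_format": "csv",
    },
)

# The CSV branch streams an attachment named JobSpy_results_<timestamp>.csv.
assert resp.headers["content-type"].startswith("text/csv")
assert "JobSpy_results_" in resp.headers["content-disposition"]
```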
main.py

@@ -10,6 +10,7 @@ app = FastAPI(
 )
 app.include_router(api_router)
 
+
 @app.get("/health", tags=["health"])
 async def health_check():
     return {"message": "JobSpy ready to scrape"}
File diff suppressed because one or more lines are too long