Add CSV output (#20)

Cullen Watson 2023-08-27 16:25:48 -05:00 committed by GitHub
parent 32a5bb37cd
commit 80a02faa75
8 changed files with 230 additions and 50 deletions

View File

@@ -13,17 +13,19 @@ POST `/api/v1/jobs/`
 ### Request Schema
 ```plaintext
-Request
-├── Required
+{
+Required
 ├── site_type (List[enum]): linkedin, zip_recruiter, indeed
 └── search_term (str)
-└── Optional
+Optional
 ├── location (int)
 ├── distance (int)
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── is_remote (bool)
 ├── results_wanted (int): per site_type
-└── easy_apply (bool): only for linkedin
+├── easy_apply (bool): only for linkedin
+└── output_format (enum): json, csv
+}
 ```
 ### Request Example
@@ -40,32 +42,37 @@ Request
 ### Response Schema
 ```plaintext
-site_type (enum)
-└── response (SiteResponse)
+{
+site_type (enum): {
+JobResponse
 ├── success (bool)
 ├── error (str)
 ├── jobs (List[JobPost])
 │   └── JobPost
 │   ├── title (str)
 │   ├── company_name (str)
 │   ├── job_url (str)
 │   ├── location (object)
 │   │   ├── country (str)
 │   │   ├── city (str)
 │   │   ├── state (str)
 │   ├── description (str)
 │   ├── job_type (enum)
 │   ├── compensation (object)
 │   │   ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
 │   │   ├── min_amount (float)
 │   │   ├── max_amount (float)
 │   │   └── currency (str): default is "US"
 │   └── date_posted (datetime)
 ├── total_results (int)
 └── returned_results (int)
+}, ...
+}
 ```
-### Response Example
+### Response Example (JSON)
 ```json
 {
     "indeed": {
@@ -119,6 +126,12 @@ site_type (enum)
     }
 }
 ```
+
+### Response Example (CSV)
+```
+Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
+indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 139480.0, 209760.0, USD, 2023-08-18T00:00:00, Job Description Designs...
+linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
+```
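For reference, a minimal client call that exercises the new CSV path might look like the sketch below. The host/port and the output filename are assumptions for a local dev server, not part of this change:

```python
import requests  # assumes the `requests` package is installed

# Hypothetical local deployment; adjust host/port to your setup
resp = requests.post(
    "http://localhost:8000/api/v1/jobs/",
    json={
        "site_type": ["indeed"],
        "search_term": "software engineer",
        "output_format": "csv",
    },
)
resp.raise_for_status()

# The endpoint streams the CSV as an attachment; save it locally
with open("jobs.csv", "wb") as f:
    f.write(resp.content)
```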
 ## Installation
 _Python >= 3.10 required_

View File

@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class OutputFormat(Enum):
+    CSV = "csv"
+    JSON = "json"
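For context, this is standard `Enum` behavior: pydantic resolves the `output_format` string from the request body by value. A small illustration (not part of the commit):

```python
from enum import Enum


class OutputFormat(Enum):
    CSV = "csv"
    JSON = "json"


# Lookup by value, which is how pydantic validates the request field
assert OutputFormat("csv") is OutputFormat.CSV
assert OutputFormat.JSON.value == "json"
```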

View File

@@ -0,0 +1,74 @@
+import csv
+from io import StringIO
+from datetime import datetime
+
+from ...jobs import *
+from ...scrapers import *
+
+
+def generate_filename() -> str:
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return f"JobSpy_results_{timestamp}.csv"
+
+
+class CSVFormatter:
+    @staticmethod
+    def format(jobs: ScraperResponse) -> StringIO:
+        """
+        Transform the scraped job objects into CSV
+        :param jobs: scraper response with per-site results
+        :return: csv
+        """
+        output = StringIO()
+        writer = csv.writer(output)
+
+        headers = [
+            "Site",
+            "Title",
+            "Company Name",
+            "Job URL",
+            "Country",
+            "City",
+            "State",
+            "Job Type",
+            "Compensation Interval",
+            "Min Amount",
+            "Max Amount",
+            "Currency",
+            "Date Posted",
+            "Description",
+        ]
+        writer.writerow(headers)
+
+        for site, job_response in jobs.dict().items():
+            if job_response and job_response.get("success"):
+                for job in job_response["jobs"]:
+                    writer.writerow(
+                        [
+                            site,
+                            job["title"],
+                            job["company_name"],
+                            job["job_url"],
+                            job["location"]["country"],
+                            job["location"]["city"],
+                            job["location"]["state"],
+                            job["job_type"].value if job.get("job_type") else "",
+                            job["compensation"]["interval"].value
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["min_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["max_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["currency"]
+                            if job["compensation"]
+                            else "",
+                            job.get("date_posted", ""),
+                            job["description"],
+                        ]
+                    )
+
+        output.seek(0)
+        return output
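A hedged usage sketch outside the API layer: assuming `scraper_response` is a populated `ScraperResponse`, the returned buffer can be written straight to disk with the generated filename:

```python
# `scraper_response` is assumed to exist; CSVFormatter.format returns a
# rewound StringIO, so getvalue() yields the full CSV text.
csv_buffer = CSVFormatter.format(scraper_response)
with open(generate_filename(), "w", newline="", encoding="utf-8") as f:
    f.write(csv_buffer.getvalue())
```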

View File

@ -1,5 +1,6 @@
from ..jobs import * from ..jobs import *
from typing import List from ..formatters import OutputFormat
from typing import List, Dict, Optional
class StatusException(Exception): class StatusException(Exception):
@ -16,6 +17,7 @@ class Site(Enum):
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
site_type: List[Site] site_type: List[Site]
search_term: str search_term: str
output_format: OutputFormat = OutputFormat.JSON
location: str = None location: str = None
distance: int = None distance: int = None
@ -26,6 +28,12 @@ class ScraperInput(BaseModel):
results_wanted: int = 15 results_wanted: int = 15
class ScraperResponse(BaseModel):
linkedin: Optional[JobResponse]
indeed: Optional[JobResponse]
zip_recruiter: Optional[JobResponse]
class Scraper: class Scraper:
def __init__(self, site: Site, url: str): def __init__(self, site: Site, url: str):
self.site = site self.site = site
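With the new field in place, a request model might be built like the sketch below. The `Site.INDEED` member name is an assumption, since the diff only shows the `Site` enum's header:

```python
from api.core.scrapers import ScraperInput, Site, OutputFormat

scraper_input = ScraperInput(
    site_type=[Site.INDEED],  # assumed member name
    search_term="software engineer",
    results_wanted=10,
    output_format=OutputFormat.CSV,  # defaults to OutputFormat.JSON when omitted
)
```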

View File

@@ -96,7 +96,9 @@ class ZipRecruiterScraper(Scraper):
         title = job.find("h2", {"class": "title"}).text
         company = job.find("a", {"class": "company_name"}).text.strip()
-        description, updated_job_url = ZipRecruiterScraper.get_description(job_url, session)
+        description, updated_job_url = ZipRecruiterScraper.get_description(
+            job_url, session
+        )
         if updated_job_url is not None:
             job_url = updated_job_url
         if description is None:

View File

@ -1,11 +1,20 @@
from concurrent.futures import ThreadPoolExecutor import io
from fastapi import APIRouter from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from concurrent.futures import ThreadPoolExecutor
from api.core.scrapers.indeed import IndeedScraper from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper from api.core.scrapers.linkedin import LinkedInScraper
from api.core.scrapers import ScraperInput, Site, JobResponse from api.core.formatters.csv import CSVFormatter, generate_filename
from typing import List, Dict, Tuple from api.core.scrapers import (
ScraperInput,
Site,
JobResponse,
OutputFormat,
ScraperResponse,
)
from typing import List, Dict, Tuple, Union
router = APIRouter(prefix="/jobs", tags=["jobs"]) router = APIRouter(prefix="/jobs", tags=["jobs"])
@ -17,23 +26,31 @@ SCRAPER_MAPPING = {
@router.post("/") @router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> Dict[str, JobResponse]: async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
""" """
Asynchronously scrapes job data from multiple job sites. Asynchronously scrapes job data from multiple job sites.
:param scraper_input: :param scraper_input:
:return: Dict[str, JobResponse]: where each key is a site :return: scraper_response
""" """
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class() scraper = scraper_class()
scraped_data = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
return (site.value, scraped_data) return (site.value, scraped_data)
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
resp_dict = { results = dict(executor.map(scrape_site, scraper_input.site_type))
site: resp
for site, resp in executor.map(scrape_site, scraper_input.site_type)
}
return resp_dict scraper_response = ScraperResponse(**results)
print(scraper_input.output_format)
if scraper_input.output_format == OutputFormat.CSV:
csv_output = CSVFormatter.format(scraper_response)
response = StreamingResponse(csv_output, media_type="text/csv")
response.headers[
"Content-Disposition"
] = f"attachment; filename={generate_filename()}"
return response
return scraper_response
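A quick way to sanity-check both branches is FastAPI's `TestClient`. A sketch under the assumptions that the app object lives in `main.py` and the router is mounted under `/api/v1`; note the scrapers hit live sites, so this is an integration-style check rather than a unit test:

```python
from fastapi.testclient import TestClient

from main import app  # assumed module layout

client = TestClient(app)
payload = {
    "site_type": ["indeed"],
    "search_term": "software engineer",
    "output_format": "csv",
}

# The CSV branch should stream an attachment with the timestamped filename
resp = client.post("/api/v1/jobs/", json=payload)
assert resp.headers["content-type"].startswith("text/csv")
assert "attachment; filename=JobSpy_results_" in resp.headers["content-disposition"]
```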

View File

@@ -10,6 +10,7 @@ app = FastAPI(
 )

 app.include_router(api_router)

+
 @app.get("/health", tags=["health"])
 async def health_check():
     return {"message": "JobSpy ready to scrape"}

File diff suppressed because one or more lines are too long