Add Csv output (#20)

pull/22/head
Cullen Watson 2023-08-27 16:25:48 -05:00 committed by GitHub
parent 32a5bb37cd
commit 80a02faa75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 230 additions and 50 deletions

View File

@ -13,17 +13,19 @@ POST `/api/v1/jobs/`
### Request Schema
```plaintext
Request
├── Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
└── search_term (str)
└── Optional
{
Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
└── search_term (str)
Optional
├── location (int)
├── distance (int)
├── job_type (enum): fulltime, parttime, internship, contract
├── is_remote (bool)
├── results_wanted (int): per site_type
└── easy_apply (bool): only for linkedin
├── easy_apply (bool): only for linkedin
└── output_format (enum): json, csv
}
```
### Request Example
@ -40,8 +42,9 @@ Request
### Response Schema
```plaintext
site_type (enum)
└── response (SiteResponse)
{
site_type (enum): {
JobResponse
├── success (bool)
├── error (str)
├── jobs (List[JobPost])
@ -61,11 +64,15 @@ site_type (enum)
│ │ ├── max_amount (float)
│ │ └── currency (str): default is "US"
│ └── date_posted (datetime)
├── total_results (int)
└── returned_results (int)
}, ...
}
```
### Response Example
### Response Example (JSON)
```json
{
"indeed": {
@ -119,6 +126,12 @@ site_type (enum)
}
}
```
### Response Example (CSV)
```
Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 139480.0, 209760.0, USD, 2023-08-18T00:00:00, Job Description Designs...
linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
```
## Installation
_Python >= 3.10 required_

View File

@ -0,0 +1,6 @@
from enum import Enum
class OutputFormat(Enum):
    """Serialization formats the /jobs endpoint can return.

    The enum *values* ("csv", "json") are what clients send in the
    request body's ``output_format`` field.
    """

    CSV = "csv"
    JSON = "json"

View File

@ -0,0 +1,74 @@
import csv
from io import StringIO
from datetime import datetime
from ...jobs import *
from ...scrapers import *
def generate_filename() -> str:
    """Build a timestamped CSV filename, e.g. ``JobSpy_results_20230827_162548.csv``.

    Uses the local clock so consecutive exports get distinct names.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "JobSpy_results_" + stamp + ".csv"
class CSVFormatter:
    """Render a ``ScraperResponse`` as CSV, one row per job across all sites."""

    # Column order of the generated CSV; one row per scraped job.
    HEADERS = [
        "Site",
        "Title",
        "Company Name",
        "Job URL",
        "Country",
        "City",
        "State",
        "Job Type",
        "Compensation Interval",
        "Min Amount",
        "Max Amount",
        "Currency",
        "Date Posted",
        "Description",
    ]

    @staticmethod
    def format(jobs: "ScraperResponse") -> StringIO:
        """
        Transform the job responses into CSV.

        :param jobs: per-site scrape results; ``.dict()`` is called on it,
            so nested models arrive as plain dicts (enum members preserved)
        :return: in-memory CSV buffer, rewound to the start for streaming
        """
        output = StringIO()
        writer = csv.writer(output)
        writer.writerow(CSVFormatter.HEADERS)

        for site, job_response in jobs.dict().items():
            # Skip sites that were not requested (None) or whose scrape failed.
            if not job_response or not job_response.get("success"):
                continue
            for job in job_response["jobs"]:
                location = job.get("location") or {}
                # Compensation — and its interval — may be absent entirely
                # (e.g. LinkedIn postings without salary data), so guard each
                # lookup instead of assuming ``interval`` is set whenever a
                # compensation dict exists.
                compensation = job.get("compensation") or {}
                interval = compensation.get("interval")
                writer.writerow(
                    [
                        site,
                        job["title"],
                        job["company_name"],
                        job["job_url"],
                        location.get("country", ""),
                        location.get("city", ""),
                        location.get("state", ""),
                        job["job_type"].value if job.get("job_type") else "",
                        interval.value if interval else "",
                        compensation.get("min_amount", ""),
                        compensation.get("max_amount", ""),
                        compensation.get("currency", ""),
                        job.get("date_posted", ""),
                        job["description"],
                    ]
                )

        # Rewind so callers (StreamingResponse) read from the beginning.
        output.seek(0)
        return output

View File

@ -1,5 +1,6 @@
from ..jobs import *
from typing import List
from ..formatters import OutputFormat
from typing import List, Dict, Optional
class StatusException(Exception):
@ -16,6 +17,7 @@ class Site(Enum):
class ScraperInput(BaseModel):
site_type: List[Site]
search_term: str
output_format: OutputFormat = OutputFormat.JSON
location: str = None
distance: int = None
@ -26,6 +28,12 @@ class ScraperInput(BaseModel):
results_wanted: int = 15
class ScraperResponse(BaseModel):
linkedin: Optional[JobResponse]
indeed: Optional[JobResponse]
zip_recruiter: Optional[JobResponse]
class Scraper:
def __init__(self, site: Site, url: str):
self.site = site

View File

@ -96,7 +96,9 @@ class ZipRecruiterScraper(Scraper):
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, updated_job_url = ZipRecruiterScraper.get_description(job_url, session)
description, updated_job_url = ZipRecruiterScraper.get_description(
job_url, session
)
if updated_job_url is not None:
job_url = updated_job_url
if description is None:

View File

@ -1,11 +1,20 @@
from concurrent.futures import ThreadPoolExecutor
import io
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from concurrent.futures import ThreadPoolExecutor
from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper
from api.core.scrapers import ScraperInput, Site, JobResponse
from typing import List, Dict, Tuple
from api.core.formatters.csv import CSVFormatter, generate_filename
from api.core.scrapers import (
ScraperInput,
Site,
JobResponse,
OutputFormat,
ScraperResponse,
)
from typing import List, Dict, Tuple, Union
router = APIRouter(prefix="/jobs", tags=["jobs"])
@ -17,23 +26,31 @@ SCRAPER_MAPPING = {
@router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> Dict[str, JobResponse]:
async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
"""
Asynchronously scrapes job data from multiple job sites.
:param scraper_input:
:return: Dict[str, JobResponse]: where each key is a site
:return: scraper_response
"""
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
scraped_data = scraper.scrape(scraper_input)
scraped_data: JobResponse = scraper.scrape(scraper_input)
return (site.value, scraped_data)
with ThreadPoolExecutor() as executor:
resp_dict = {
site: resp
for site, resp in executor.map(scrape_site, scraper_input.site_type)
}
results = dict(executor.map(scrape_site, scraper_input.site_type))
return resp_dict
scraper_response = ScraperResponse(**results)
print(scraper_input.output_format)
if scraper_input.output_format == OutputFormat.CSV:
csv_output = CSVFormatter.format(scraper_response)
response = StreamingResponse(csv_output, media_type="text/csv")
response.headers[
"Content-Disposition"
] = f"attachment; filename={generate_filename()}"
return response
return scraper_response

View File

@ -10,6 +10,7 @@ app = FastAPI(
)
app.include_router(api_router)
@app.get("/health", tags=["health"])
async def health_check():
    """Liveness probe: return a static message confirming the API is up."""
    return {"message": "JobSpy ready to scrape"}

File diff suppressed because one or more lines are too long