mirror of https://github.com/Bunsly/JobSpy
Add Csv output (#20)
parent 32a5bb37cd
commit 80a02faa75

README.md (31 changes)
@@ -13,17 +13,19 @@ POST `/api/v1/jobs/`
 
 ### Request Schema
 ```plaintext
-Request
-├── Required
-│ ├── site_type (List[enum]): linkedin, zip_recruiter, indeed
-│ └── search_term (str)
-└── Optional
+{
+Required
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+└── search_term (str)
+Optional
 ├── location (str)
 ├── distance (int)
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── is_remote (bool)
 ├── results_wanted (int): per site_type
-└── easy_apply (bool): only for linkedin
+├── easy_apply (bool): only for linkedin
+└── output_format (enum): json, csv
+}
 ```
 
 ### Request Example
@@ -40,8 +42,9 @@ Request
 
 ### Response Schema
 ```plaintext
-site_type (enum)
-└── response (SiteResponse)
+{
+site_type (enum): {
 JobResponse
 ├── success (bool)
 ├── error (str)
 ├── jobs (List[JobPost])
@@ -61,11 +64,15 @@ site_type (enum)
 │ │ ├── max_amount (float)
 │ │ └── currency (str): default is "USD"
 │ └── date_posted (datetime)
 │
 ├── total_results (int)
 └── returned_results (int)
+}, ...
+}
 ```
 
-### Response Example
+### Response Example (JSON)
 ```json
 {
   "indeed": {
@@ -119,6 +126,12 @@ site_type (enum)
 }
 }
 ```
+### Response Example (CSV)
+```
+Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
+indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 139480.0, 209760.0, USD, 2023-08-18T00:00:00, Job Description Designs...
+linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
+```
 
 ## Installation
 _Python >= 3.10 required_
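The new `output_format` field defaults to `json`, so existing clients are unaffected. A minimal sketch of exercising the CSV path against a locally running server (the host and port here are assumptions, not part of this diff):

```python
# Sketch: POST a search with the new output_format field (assumes a local
# uvicorn deployment on port 8000; adjust the base URL to your setup).
import requests

payload = {
    "site_type": ["indeed", "linkedin"],
    "search_term": "software engineer",
    "location": "austin, tx",
    "output_format": "csv",  # omit or use "json" for the old behavior
}

resp = requests.post("http://localhost:8000/api/v1/jobs/", json=payload)

# With output_format=csv the endpoint streams a file attachment.
with open("jobs.csv", "wb") as f:
    f.write(resp.content)
```

The next two hunks are new files: the `OutputFormat` enum and the CSV formatter (their paths are not shown in this view, though the route changes below import them from `api.core.formatters`).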
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class OutputFormat(Enum):
+    CSV = "csv"
+    JSON = "json"
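Because the enum members carry the lowercase strings as values, pydantic can coerce the raw `"csv"` / `"json"` strings from the request body into `OutputFormat` members without extra code. A quick illustrative sketch (pydantic v1 behavior, matching the `BaseModel` usage elsewhere in this diff):

```python
# Sketch: pydantic coerces raw strings to OutputFormat members by value.
from enum import Enum
from pydantic import BaseModel


class OutputFormat(Enum):
    CSV = "csv"
    JSON = "json"


class Example(BaseModel):
    output_format: OutputFormat = OutputFormat.JSON


print(Example(output_format="csv").output_format)  # OutputFormat.CSV
print(Example().output_format)  # OutputFormat.JSON (the default)
```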
@@ -0,0 +1,74 @@
+import csv
+from io import StringIO
+from datetime import datetime
+
+from ...jobs import *
+from ...scrapers import *
+
+
+def generate_filename() -> str:
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return f"JobSpy_results_{timestamp}.csv"
+
+
+class CSVFormatter:
+    @staticmethod
+    def format(jobs: ScraperResponse) -> StringIO:
+        """
+        Transform the job objects into CSV rows
+        :param jobs: scraper response containing one JobResponse per site
+        :return: CSV content in an in-memory buffer
+        """
+        output = StringIO()
+        writer = csv.writer(output)
+
+        headers = [
+            "Site",
+            "Title",
+            "Company Name",
+            "Job URL",
+            "Country",
+            "City",
+            "State",
+            "Job Type",
+            "Compensation Interval",
+            "Min Amount",
+            "Max Amount",
+            "Currency",
+            "Date Posted",
+            "Description",
+        ]
+        writer.writerow(headers)
+
+        for site, job_response in jobs.dict().items():
+            if job_response and job_response.get("success"):
+                for job in job_response["jobs"]:
+                    writer.writerow(
+                        [
+                            site,
+                            job["title"],
+                            job["company_name"],
+                            job["job_url"],
+                            job["location"]["country"],
+                            job["location"]["city"],
+                            job["location"]["state"],
+                            job["job_type"].value if job.get("job_type") else "",
+                            job["compensation"]["interval"].value
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["min_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["max_amount"]
+                            if job["compensation"]
+                            else "",
+                            job["compensation"]["currency"]
+                            if job["compensation"]
+                            else "",
+                            job.get("date_posted", ""),
+                            job["description"],
+                        ]
+                    )
+
+        output.seek(0)
+        return output
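Two details worth noting: `format` iterates over `jobs.dict()`, and under pydantic v1 `.dict()` converts nested models to plain dicts while leaving enum members intact, which is why `.value` is still needed for `job_type` and `interval`. Standalone usage outside the HTTP layer is straightforward; a small sketch, where `scraper_response` is assumed to be an already-populated `ScraperResponse` (e.g. the value the route handler builds below):

```python
# Sketch: persist the formatter's in-memory buffer to disk.
# scraper_response is a hypothetical, pre-populated ScraperResponse.
from api.core.formatters.csv import CSVFormatter, generate_filename

buffer = CSVFormatter.format(scraper_response)

with open(generate_filename(), "w", newline="") as f:
    f.write(buffer.getvalue())
```

The scraper models pick up the new enum and a response wrapper: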
@@ -1,5 +1,6 @@
 from ..jobs import *
-from typing import List
+from ..formatters import OutputFormat
+from typing import List, Dict, Optional
 
 
 class StatusException(Exception):
@@ -16,6 +17,7 @@ class Site(Enum):
 class ScraperInput(BaseModel):
     site_type: List[Site]
     search_term: str
+    output_format: OutputFormat = OutputFormat.JSON
 
     location: str = None
     distance: int = None
@@ -26,6 +28,12 @@ class ScraperInput(BaseModel):
     results_wanted: int = 15
 
 
+class ScraperResponse(BaseModel):
+    linkedin: Optional[JobResponse]
+    indeed: Optional[JobResponse]
+    zip_recruiter: Optional[JobResponse]
+
+
 class Scraper:
     def __init__(self, site: Site, url: str):
         self.site = site
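Since every field on `ScraperResponse` is `Optional`, sites the caller never requested simply come back as `None`, which is exactly what the `if job_response and job_response.get("success")` guard in the CSV formatter checks for. A tiny sketch of that shape (relying on pydantic v1 treating `Optional[...]` fields as implicitly defaulting to `None`):

```python
# Sketch: unrequested sites default to None on the response model.
empty = ScraperResponse()
assert empty.linkedin is None
assert empty.indeed is None
assert empty.zip_recruiter is None
```

The remaining hunks are a formatting-only change in the ZipRecruiter scraper, then the route changes that wire everything together: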
@@ -96,7 +96,9 @@ class ZipRecruiterScraper(Scraper):
             title = job.find("h2", {"class": "title"}).text
             company = job.find("a", {"class": "company_name"}).text.strip()
 
-            description, updated_job_url = ZipRecruiterScraper.get_description(job_url, session)
+            description, updated_job_url = ZipRecruiterScraper.get_description(
+                job_url, session
+            )
             if updated_job_url is not None:
                 job_url = updated_job_url
             if description is None:
@@ -1,11 +1,20 @@
-from concurrent.futures import ThreadPoolExecutor
+import io
 from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from concurrent.futures import ThreadPoolExecutor
 
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
 from api.core.scrapers.linkedin import LinkedInScraper
-from api.core.scrapers import ScraperInput, Site, JobResponse
-from typing import List, Dict, Tuple
+from api.core.formatters.csv import CSVFormatter, generate_filename
+from api.core.scrapers import (
+    ScraperInput,
+    Site,
+    JobResponse,
+    OutputFormat,
+    ScraperResponse,
+)
+from typing import List, Dict, Tuple, Union
 
 router = APIRouter(prefix="/jobs", tags=["jobs"])
 
@@ -17,23 +26,31 @@ SCRAPER_MAPPING = {
 
 
 @router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> Dict[str, JobResponse]:
+async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
-    :return: Dict[str, JobResponse]: where each key is a site
+    :return: scraper_response
     """
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
-        scraped_data = scraper.scrape(scraper_input)
+        scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)
 
     with ThreadPoolExecutor() as executor:
-        resp_dict = {
-            site: resp
-            for site, resp in executor.map(scrape_site, scraper_input.site_type)
-        }
+        results = dict(executor.map(scrape_site, scraper_input.site_type))
 
-    return resp_dict
+    scraper_response = ScraperResponse(**results)
+
+    print(scraper_input.output_format)
+    if scraper_input.output_format == OutputFormat.CSV:
+        csv_output = CSVFormatter.format(scraper_response)
+        response = StreamingResponse(csv_output, media_type="text/csv")
+        response.headers[
+            "Content-Disposition"
+        ] = f"attachment; filename={generate_filename()}"
+        return response
+
+    return scraper_response
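The collection change is behavior-preserving: `scrape_site` returns `(site.value, JobResponse)` tuples, and `dict()` consumes the iterator of pairs from `executor.map` just as the old comprehension did. A toy sketch of the same pattern, with placeholder work standing in for the scrapers:

```python
# Sketch: dict() over executor.map() builds {key: value} from (key, value) pairs.
from concurrent.futures import ThreadPoolExecutor
from typing import Tuple


def scrape_stub(site: str) -> Tuple[str, int]:
    return (site, len(site))  # placeholder for scraper.scrape(...)


with ThreadPoolExecutor() as executor:
    results = dict(executor.map(scrape_stub, ["indeed", "linkedin", "zip_recruiter"]))

print(results)  # {'indeed': 6, 'linkedin': 8, 'zip_recruiter': 13}
```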
main.py (1 change)
@@ -10,6 +10,7 @@ app = FastAPI(
 )
 app.include_router(api_router)
 
+
 @app.get("/health", tags=["health"])
 async def health_check():
     return {"message": "JobSpy ready to scrape"}
File diff suppressed because one or more lines are too long