mirror of https://github.com/Bunsly/JobSpy
Add Csv output (#20)
parent
32a5bb37cd
commit
80a02faa75
31
README.md
31
README.md
|
@ -13,17 +13,19 @@ POST `/api/v1/jobs/`
|
||||||
### Request Schema
|
### Request Schema
|
||||||
|
|
||||||
```plaintext
|
```plaintext
|
||||||
Request
|
{
|
||||||
├── Required
|
Required
|
||||||
│ ├── site_type (List[enum]): linkedin, zip_recruiter, indeed
|
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
|
||||||
│ └── search_term (str)
|
└── search_term (str)
|
||||||
└── Optional
|
Optional
|
||||||
├── location (int)
|
├── location (str)
|
||||||
├── distance (int)
|
├── distance (int)
|
||||||
├── job_type (enum): fulltime, parttime, internship, contract
|
├── job_type (enum): fulltime, parttime, internship, contract
|
||||||
├── is_remote (bool)
|
├── is_remote (bool)
|
||||||
├── results_wanted (int): per site_type
|
├── results_wanted (int): per site_type
|
||||||
└── easy_apply (bool): only for linkedin
|
├── easy_apply (bool): only for linkedin
|
||||||
|
└── output_format (enum): json, csv
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Request Example
|
### Request Example
|
||||||
|
@ -40,8 +42,9 @@ Request
|
||||||
|
|
||||||
### Response Schema
|
### Response Schema
|
||||||
```plaintext
|
```plaintext
|
||||||
site_type (enum)
|
{
|
||||||
└── response (SiteResponse)
|
site_type (enum): {
|
||||||
|
JobResponse
|
||||||
├── success (bool)
|
├── success (bool)
|
||||||
├── error (str)
|
├── error (str)
|
||||||
├── jobs (List[JobPost])
|
├── jobs (List[JobPost])
|
||||||
|
@ -61,11 +64,15 @@ site_type (enum)
|
||||||
│ │ ├── max_amount (float)
|
│ │ ├── max_amount (float)
|
||||||
│ │ └── currency (str): default is "US"
|
│ │ └── currency (str): default is "US"
|
||||||
│ └── date_posted (datetime)
|
│ └── date_posted (datetime)
|
||||||
|
│
|
||||||
├── total_results (int)
|
├── total_results (int)
|
||||||
└── returned_results (int)
|
└── returned_results (int)
|
||||||
|
}, ...
|
||||||
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Response Example
|
### Response Example (JSON)
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"indeed": {
|
"indeed": {
|
||||||
|
@ -119,6 +126,12 @@ site_type (enum)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
### Response Example (CSV)
|
||||||
|
```
|
||||||
|
Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
|
||||||
|
indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 209760.0, 139480.0, USD, 2023-08-18T00:00:00, Job Description Designs...
|
||||||
|
linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
|
||||||
|
```
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
_Python >= 3.10 required_
|
_Python >= 3.10 required_
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class OutputFormat(Enum):
    """Serialization formats a scrape request may ask for."""

    # Each member's value is the literal string used for the format,
    # e.g. ``OutputFormat("csv") is OutputFormat.CSV``.
    CSV = "csv"
    JSON = "json"
|
|
@ -0,0 +1,74 @@
|
||||||
|
import csv
|
||||||
|
from io import StringIO
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from ...jobs import *
|
||||||
|
from ...scrapers import *
|
||||||
|
|
||||||
|
|
||||||
|
def generate_filename() -> str:
    """
    Build a timestamped name for a CSV export.

    :return: ``JobSpy_results_<YYYYmmdd>_<HHMMSS>.csv`` using the current
        time as reported by ``datetime.now()``
    """
    # The format spec inside the f-string is handed to strftime unchanged.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return "JobSpy_results_" + stamp + ".csv"
|
||||||
|
|
||||||
|
|
||||||
|
class CSVFormatter:
    """Render scraped job postings as CSV text."""

    @staticmethod
    def format(jobs: "ScraperResponse") -> StringIO:
        """
        Transform the scraped job objects into CSV.

        :param jobs: scraper response; its ``dict()`` must map each site name
            to that site's job response dict, or to ``None`` when the site
            has no response
        :return: in-memory text buffer, rewound to the start, holding the
            header row followed by one row per job of every successful site
        """
        output = StringIO()
        writer = csv.writer(output)

        writer.writerow(
            [
                "Site",
                "Title",
                "Company Name",
                "Job URL",
                "Country",
                "City",
                "State",
                "Job Type",
                "Compensation Interval",
                "Min Amount",
                "Max Amount",
                "Currency",
                "Date Posted",
                "Description",
            ]
        )

        for site, job_response in jobs.dict().items():
            # Sites without a response (None) or with a failed scrape
            # contribute no rows.
            if not job_response or not job_response.get("success"):
                continue

            # A successful response with a missing/None job list is treated
            # as empty rather than aborting the whole export.
            for job in job_response.get("jobs") or []:
                location = job["location"]
                # Resolve the optional nested values once; an absent
                # compensation block yields blank cells for all four columns.
                compensation = job.get("compensation") or {}
                job_type = job.get("job_type")
                interval = compensation.get("interval")

                writer.writerow(
                    [
                        site,
                        job["title"],
                        job["company_name"],
                        job["job_url"],
                        location["country"],
                        location["city"],
                        location["state"],
                        # Enum members are reduced to their raw value; a
                        # missing enum becomes an empty cell instead of
                        # raising AttributeError on None.
                        job_type.value if job_type else "",
                        interval.value if interval else "",
                        compensation.get("min_amount", ""),
                        compensation.get("max_amount", ""),
                        compensation.get("currency", ""),
                        # csv.writer emits None as an empty string.
                        job.get("date_posted", ""),
                        job["description"],
                    ]
                )

        # Rewind so the caller can read/stream the buffer from the beginning.
        output.seek(0)
        return output
|
|
@ -1,5 +1,6 @@
|
||||||
from ..jobs import *
|
from ..jobs import *
|
||||||
from typing import List
|
from ..formatters import OutputFormat
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
class StatusException(Exception):
|
class StatusException(Exception):
|
||||||
|
@ -16,6 +17,7 @@ class Site(Enum):
|
||||||
class ScraperInput(BaseModel):
|
class ScraperInput(BaseModel):
|
||||||
site_type: List[Site]
|
site_type: List[Site]
|
||||||
search_term: str
|
search_term: str
|
||||||
|
output_format: OutputFormat = OutputFormat.JSON
|
||||||
|
|
||||||
location: str = None
|
location: str = None
|
||||||
distance: int = None
|
distance: int = None
|
||||||
|
@ -26,6 +28,12 @@ class ScraperInput(BaseModel):
|
||||||
results_wanted: int = 15
|
results_wanted: int = 15
|
||||||
|
|
||||||
|
|
||||||
|
class ScraperResponse(BaseModel):
    """Per-site scrape results, one optional field per supported site."""

    # Explicit ``None`` defaults keep every field optional regardless of the
    # pydantic major version (v2 treats a bare ``Optional[X]`` annotation as
    # required), so the model can be built from only the sites that were
    # actually requested.
    linkedin: Optional[JobResponse] = None
    indeed: Optional[JobResponse] = None
    zip_recruiter: Optional[JobResponse] = None
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, site: Site, url: str):
|
def __init__(self, site: Site, url: str):
|
||||||
self.site = site
|
self.site = site
|
||||||
|
|
|
@ -96,7 +96,9 @@ class ZipRecruiterScraper(Scraper):
|
||||||
title = job.find("h2", {"class": "title"}).text
|
title = job.find("h2", {"class": "title"}).text
|
||||||
company = job.find("a", {"class": "company_name"}).text.strip()
|
company = job.find("a", {"class": "company_name"}).text.strip()
|
||||||
|
|
||||||
description, updated_job_url = ZipRecruiterScraper.get_description(job_url, session)
|
description, updated_job_url = ZipRecruiterScraper.get_description(
|
||||||
|
job_url, session
|
||||||
|
)
|
||||||
if updated_job_url is not None:
|
if updated_job_url is not None:
|
||||||
job_url = updated_job_url
|
job_url = updated_job_url
|
||||||
if description is None:
|
if description is None:
|
||||||
|
|
|
@ -1,11 +1,20 @@
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
import io
|
||||||
from fastapi import APIRouter
|
from fastapi import APIRouter
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
from api.core.scrapers.indeed import IndeedScraper
|
from api.core.scrapers.indeed import IndeedScraper
|
||||||
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
|
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
|
||||||
from api.core.scrapers.linkedin import LinkedInScraper
|
from api.core.scrapers.linkedin import LinkedInScraper
|
||||||
from api.core.scrapers import ScraperInput, Site, JobResponse
|
from api.core.formatters.csv import CSVFormatter, generate_filename
|
||||||
from typing import List, Dict, Tuple
|
from api.core.scrapers import (
|
||||||
|
ScraperInput,
|
||||||
|
Site,
|
||||||
|
JobResponse,
|
||||||
|
OutputFormat,
|
||||||
|
ScraperResponse,
|
||||||
|
)
|
||||||
|
from typing import List, Dict, Tuple, Union
|
||||||
|
|
||||||
router = APIRouter(prefix="/jobs", tags=["jobs"])
|
router = APIRouter(prefix="/jobs", tags=["jobs"])
|
||||||
|
|
||||||
|
@ -17,23 +26,31 @@ SCRAPER_MAPPING = {
|
||||||
|
|
||||||
|
|
||||||
@router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> ScraperResponse:
    """
    Scrape every requested job site and return the combined results.

    Each site in ``scraper_input.site_type`` is scraped on a worker thread
    of a ``ThreadPoolExecutor``; the results are collected into a
    ``ScraperResponse`` keyed by site value.

    :param scraper_input: search parameters, sites to scrape and the desired
        output format
    :return: the ``ScraperResponse`` itself for JSON output; when
        ``output_format`` is CSV, a ``StreamingResponse`` carrying the CSV
        rendering as a file attachment is returned instead
    """

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        # Look up and instantiate the scraper registered for this site.
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class()
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        return (site.value, scraped_data)

    # executor.map yields (site_value, response) pairs, which dict() turns
    # into the keyword arguments expected by ScraperResponse.
    with ThreadPoolExecutor() as executor:
        results = dict(executor.map(scrape_site, scraper_input.site_type))

    scraper_response = ScraperResponse(**results)

    if scraper_input.output_format == OutputFormat.CSV:
        csv_output = CSVFormatter.format(scraper_response)
        response = StreamingResponse(csv_output, media_type="text/csv")
        # Mark the payload as a download with a timestamped filename.
        response.headers[
            "Content-Disposition"
        ] = f"attachment; filename={generate_filename()}"
        return response

    return scraper_response
|
||||||
|
|
1
main.py
1
main.py
|
@ -10,6 +10,7 @@ app = FastAPI(
|
||||||
)
|
)
|
||||||
app.include_router(api_router)
|
app.include_router(api_router)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health", tags=["health"])
|
@app.get("/health", tags=["health"])
|
||||||
async def health_check():
|
async def health_check():
|
||||||
return {"message": "JobSpy ready to scrape"}
|
return {"message": "JobSpy ready to scrape"}
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue