# JobSpy/api/v1/jobs/__init__.py
import io
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from concurrent.futures import ThreadPoolExecutor

from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper
from api.core.formatters.csv import CSVFormatter
from api.core.scrapers import (
    ScraperInput,
    Site,
    JobResponse,
    OutputFormat,
    CommonResponse,
)
from typing import List, Dict, Tuple, Union

router = APIRouter(prefix="/jobs", tags=["jobs"])
2023-07-07 19:00:59 -07:00
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
}
2023-07-07 19:00:59 -07:00
2023-07-08 07:34:55 -07:00
2023-08-26 18:30:00 -07:00
@router.post("/")
2023-08-27 18:32:46 -07:00
async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
2023-08-26 18:30:00 -07:00
"""
Asynchronously scrapes job data from multiple job sites.
:param scraper_input:
2023-08-27 14:25:48 -07:00
:return: scraper_response
2023-08-26 18:30:00 -07:00
"""
2023-08-26 18:30:00 -07:00
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
2023-08-27 14:25:48 -07:00
scraped_data: JobResponse = scraper.scrape(scraper_input)
2023-08-26 18:30:00 -07:00
return (site.value, scraped_data)
2023-07-08 07:34:55 -07:00
2023-08-31 08:29:43 -07:00
with ThreadPoolExecutor(max_workers=3) as executor:
2023-08-27 14:25:48 -07:00
results = dict(executor.map(scrape_site, scraper_input.site_type))
2023-08-27 18:32:46 -07:00
scraper_response = CommonResponse(status="JSON response success", **results)
2023-08-27 14:25:48 -07:00
if scraper_input.output_format == OutputFormat.CSV:
csv_output = CSVFormatter.format(scraper_response)
response = StreamingResponse(csv_output, media_type="text/csv")
response.headers[
"Content-Disposition"
2023-08-27 18:32:46 -07:00
] = f"attachment; filename={CSVFormatter.generate_filename()}"
2023-08-27 14:25:48 -07:00
return response
2023-08-26 18:30:00 -07:00
2023-08-27 18:32:46 -07:00
elif scraper_input.output_format == OutputFormat.GSHEET:
csv_output = CSVFormatter.format(scraper_response)
try:
CSVFormatter.upload_to_google_sheet(csv_output)
2023-08-31 08:29:43 -07:00
return CommonResponse(
status="Successfully uploaded to Google Sheets", **results
)
2023-08-27 18:32:46 -07:00
except Exception as e:
return CommonResponse(
2023-08-31 08:29:43 -07:00
status="Failed to upload to Google Sheet", error=repr(e), **results
2023-08-27 18:32:46 -07:00
)
else:
return scraper_response