import asyncio
import io
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Tuple, Union

from fastapi import APIRouter
from fastapi.responses import StreamingResponse

from api.core.formatters.csv import CSVFormatter
from api.core.scrapers import (
    ScraperInput,
    Site,
    JobResponse,
    OutputFormat,
    CommonResponse,
)
from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.linkedin import LinkedInScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
|
|
|
router = APIRouter(prefix="/jobs", tags=["jobs"])
|
2023-07-07 19:00:59 -07:00
|
|
|
|
2023-07-08 19:36:08 -07:00
|
|
|
SCRAPER_MAPPING = {
|
|
|
|
Site.LINKEDIN: LinkedInScraper,
|
|
|
|
Site.INDEED: IndeedScraper,
|
|
|
|
Site.ZIP_RECRUITER: ZipRecruiterScraper,
|
|
|
|
}
@router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
    """
    Scrape job postings from every site requested in ``scraper_input``.

    Each requested site is scraped concurrently on a worker thread, and the
    results are combined into a single response keyed by site name.

    :param scraper_input: search parameters, including the sites to scrape
        (``site_type``) and the desired ``output_format``.
    :return: a ``CommonResponse`` containing one ``JobResponse`` per site;
        for CSV output, a ``StreamingResponse`` with a file attachment instead.
    """

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        """Run one site's scraper synchronously; return (site name, results)."""
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class()
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        return site.value, scraped_data

    # BUGFIX: the original called executor.map(...) directly inside this
    # async def, which blocked the asyncio event loop for the full duration
    # of every scrape. Hand each blocking scrape to the thread pool via
    # run_in_executor and await them, keeping the same concurrency and the
    # same {site.value: JobResponse} result shape.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        site_results = await asyncio.gather(
            *(
                loop.run_in_executor(executor, scrape_site, site)
                for site in scraper_input.site_type
            )
        )
    results = dict(site_results)

    scraper_response = CommonResponse(status="JSON response success", **results)

    if scraper_input.output_format == OutputFormat.CSV:
        csv_output = CSVFormatter.format(scraper_response)
        response = StreamingResponse(csv_output, media_type="text/csv")
        response.headers[
            "Content-Disposition"
        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
        return response

    elif scraper_input.output_format == OutputFormat.GSHEET:
        csv_output = CSVFormatter.format(scraper_response)
        try:
            CSVFormatter.upload_to_google_sheet(csv_output)
            return CommonResponse(status="Successfully uploaded to Google Sheets")
        # Broad catch is deliberate: the upload talks to an external service,
        # and any failure should surface as an error payload, not a 500.
        except Exception as e:
            return CommonResponse(
                status="Failed to upload to Google Sheet", error=repr(e)
            )

    else:
        return scraper_response