From 7285ca7108211b3368185f642d977f59a8cc5567 Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@cullenwatson.com>
Date: Sat, 26 Aug 2023 14:25:52 -0500
Subject: [PATCH] feat(scraper): threading per scraper type

---
 README.md               |  2 +-
 api/auth/db_utils.py    |  3 ++-
 api/v1/jobs/__init__.py | 12 +++++++-----
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 14dc31b..ad62882 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@
 - **distance**: int
 - **job_type**: str - Options: `fulltime`, `parttime`, `internship`, `contract`
 - **is_remote**: bool
-- **results_wanted**: int
+- **results_wanted**: int (per `site_type`)
 - **easy_apply**: bool (Only for LinkedIn)
 
 ### Example
diff --git a/api/auth/db_utils.py b/api/auth/db_utils.py
index bca7f4c..696513a 100644
--- a/api/auth/db_utils.py
+++ b/api/auth/db_utils.py
@@ -8,7 +8,8 @@ from api.core.users import UserInDB
 from settings import SUPABASE_URL, SUPABASE_KEY
 
 pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
-supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+if SUPABASE_URL:
+    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
 
 
 def create_user(user_create: UserInDB):
diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py
index a78862d..f45cfe8 100644
--- a/api/v1/jobs/__init__.py
+++ b/api/v1/jobs/__init__.py
@@ -1,3 +1,4 @@
+from concurrent.futures import ThreadPoolExecutor
 from fastapi import APIRouter
 
 from api.core.scrapers.indeed import IndeedScraper
@@ -16,12 +17,13 @@ SCRAPER_MAPPING = {
 
 
 @router.post("/", response_model=List[JobResponse])
-async def scrape_jobs(scraper_input: ScraperInput) -> JobResponse:
-    resp = []
-    for site in scraper_input.site_type:
+async def scrape_jobs(scraper_input: ScraperInput) -> List[JobResponse]:
+    def scrape_site(site: str) -> JobResponse:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
-        job_response = scraper.scrape(scraper_input)
-        resp.append(job_response)
+        return scraper.scrape(scraper_input)
+
+    with ThreadPoolExecutor() as executor:
+        resp = list(executor.map(scrape_site, scraper_input.site_type))
     return resp
 