From 0fa92544c91b95f2f102e3cc5681cb39988cebbd Mon Sep 17 00:00:00 2001
From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com>
Date: Sat, 2 Sep 2023 13:50:12 -0700
Subject: [PATCH] Replace the FastAPI endpoints with a synchronous scrape_jobs
 function that returns a DataFrame

---
 src/__init__.py                            | 74 ++++++++++++++--------
 src/core/scrapers/__init__.py              |  2 +-
 src/core/scrapers/ziprecruiter/__init__.py | 18 +++---
 tests/test_indeed.py                       | 10 +++
 tests/test_ziprecruiter.py                 | 10 +++
 5 files changed, 77 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_indeed.py
 create mode 100644 tests/test_ziprecruiter.py

diff --git a/src/__init__.py b/src/__init__.py
index 6213f49..be0c650 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
 
-router = APIRouter(prefix="/jobs", tags=["jobs"])
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union
 
 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }
 
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
 
-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int | None = None,
+    is_remote: bool = False,
+    job_type: JobType | None = None,
+    easy_apply: bool = False,  # linkedin only
+    results_wanted: int = 15,
+) -> pd.DataFrame:
     """
-    Asynchronously scrapes job data from multiple job sites.
-    :param scraper_input:
-    :return: scraper_response
+    Concurrently scrapes job data from multiple job sites.
+    :return: DataFrame with one row per scraped job
     """
+    if isinstance(site_name, str):
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if isinstance(site_name, Site) else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format,
+    )
+
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
 
-    scraper_response = CommonResponse(status="JSON response success", **results)
-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
+    dfs = []
 
-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
+    for site, job_response in results.items():
+        for job in job_response.jobs:
+            data = job.json()
+
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            dfs.append(data_df)
+
+    #: one row per job
+    return pd.DataFrame(dfs).reset_index(drop=True)
 
-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
-    else:
-        return scraper_response
diff --git a/src/core/scrapers/__init__.py b/src/core/scrapers/__init__.py
index 876e9a9..7ed8e78 100644
--- a/src/core/scrapers/__init__.py
+++ b/src/core/scrapers/__init__.py
@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any
diff --git a/src/core/scrapers/ziprecruiter/__init__.py b/src/core/scrapers/ziprecruiter/__init__.py
index cf329dd..89b3d09 100644
--- a/src/core/scrapers/ziprecruiter/__init__.py
+++ b/src/core/scrapers/ziprecruiter/__init__.py
@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }
 
+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+
         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)
 
-        html_string = response.content
+        html_string = response.text
         soup = BeautifulSoup(html_string, "html.parser")
 
         if page == 1:
diff --git a/tests/test_indeed.py b/tests/test_indeed.py
new file mode 100644
index 0000000..25dd8a0
--- /dev/null
+++ b/tests/test_indeed.py
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None
diff --git a/tests/test_ziprecruiter.py b/tests/test_ziprecruiter.py
new file mode 100644
index 0000000..eb56693
--- /dev/null
+++ b/tests/test_ziprecruiter.py
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None
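
For reference, a minimal usage sketch of the new scrape_jobs API introduced by
this patch, mirroring the tests above (the location value and the printed
column are illustrative assumptions, not part of the patch):

    from src import scrape_jobs

    # site_name accepts a string (mapped onto the Site enum via
    # _map_str_to_site), a single Site member, or a list of Site members
    df = scrape_jobs(
        site_name="zip_recruiter",
        search_term="software engineer",
        location="Phoenix, AZ",  # optional, defaults to ""
        results_wanted=10,       # optional, defaults to 15
    )

    print(len(df))     # one row per scraped job
    print(df["site"])  # which site each row came from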