
pull/31/head
Zachary Hampton 2023-09-02 13:50:12 -07:00
parent 9a86d2b1f5
commit 0fa92544c9
5 changed files with 77 additions and 37 deletions

View File

@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
-
-router = APIRouter(prefix="/jobs", tags=["jobs"])
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union

 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }

+
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
+
-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15
+) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
     :return: scraper_response
     """
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format
+    )
+
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))

-    scraper_response = CommonResponse(status="JSON response success", **results)
-
-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
-    else:
-        return scraper_response
+    df = pd.DataFrame()
+
+    for site in results:
+        for job in results[site].jobs:
+            data = job.json()
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            #: concat
+            df = pd.concat([df, data_df], axis=1)
+
+    return df
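
Taken together, this file's changes turn scrape_jobs from a FastAPI POST handler into a plain synchronous library function: the router and the CSV/Google Sheets response branches are removed, and the per-site JobResponse results are flattened into a single pandas DataFrame. A minimal usage sketch, assuming the package exposes scrape_jobs at the top level the way the new tests below import it (from src import scrape_jobs); the argument values are hypothetical:

    from src import scrape_jobs

    # site_name accepts a site string, a Site enum member, or a list of Site
    # members; strings are resolved via Site[site_name.upper()] by the new
    # _map_str_to_site helper.
    df = scrape_jobs(
        site_name="indeed",
        search_term="software engineer",
        location="Dallas, TX",  # hypothetical, for illustration
        results_wanted=10,
    )
    print(df)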

View File

@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any

View File

@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }

+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+
         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)

-        html_string = response.content
+        html_string = response.text

         soup = BeautifulSoup(html_string, "html.parser")
         if page == 1:
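
The params rewrite replaces dict entries that could hold None with conditional insertion, so the dict only ever contains filters that were actually set; the response.content to response.text change hands BeautifulSoup a decoded string rather than raw bytes. A self-contained sketch of the conditional-params pattern, with hypothetical values and requests used for illustration (the scraper's own session object may be a different HTTP client):

    import requests

    # Base query; always-present keys stay in the literal.
    params = {"search": "software engineer", "location": "Dallas, TX", "page": 1}

    is_remote = False       # hypothetical inputs
    distance = 25
    job_type_value = None

    # Optional filters are added only when they have a real value, instead of
    # being present with value None.
    if is_remote:
        params["refine_by_location_type"] = "only_remote"
    if distance:
        params["radius"] = distance
    if job_type_value:
        params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"

    response = requests.get("https://www.ziprecruiter.com/jobs-search", params=params)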

tests/test_indeed.py Normal file
View File

@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None

View File

@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None
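
Both new tests are smoke tests: they assert only that something non-None comes back. Since scrape_jobs now returns a pandas DataFrame, a slightly stricter variant (hypothetical, not part of this commit) could also check that the frame is non-empty:

    from src import scrape_jobs

    def test_indeed_returns_results():
        result = scrape_jobs(
            site_name="indeed",
            search_term="software engineer",
        )

        assert result is not None
        assert not result.empty  # .empty is standard pandas; fails if no jobs were scraped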