
pull/31/head
Zachary Hampton 2023-09-02 13:50:12 -07:00
parent 9a86d2b1f5
commit 0fa92544c9
5 changed files with 77 additions and 37 deletions

View File

@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
-
-router = APIRouter(prefix="/jobs", tags=["jobs"])
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union

 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }

+
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
+
-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15
+) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
     :return: scraper_response
     """
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format
+    )
+
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))

-    scraper_response = CommonResponse(status="JSON response success", **results)
-
-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
-    else:
-        return scraper_response
+    df = pd.DataFrame()
+
+    for site in results:
+        for job in results[site].jobs:
+            data = job.json()
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            #: concat
+            df = pd.concat([df, data_df], axis=1)
+
+    return df
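
Taken together, this file's changes turn scrape_jobs from a FastAPI POST handler into a plain synchronous library function: the router and the CSV/Google Sheets response branches are removed, and the per-site JobResponse results are flattened into a single pandas DataFrame. A minimal usage sketch, assuming the package exposes scrape_jobs at the top level the way the new tests below import it (from src import scrape_jobs); the argument values are hypothetical:

    from src import scrape_jobs

    # site_name accepts a site string, a Site enum member, or a list of Site
    # members; strings are resolved via Site[site_name.upper()] by the new
    # _map_str_to_site helper.
    df = scrape_jobs(
        site_name="indeed",
        search_term="software engineer",
        location="Dallas, TX",  # hypothetical, for illustration
        results_wanted=10,
    )
    print(df)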

View File

@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any

View File

@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }

+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+
         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)

-        html_string = response.content
+        html_string = response.text

         soup = BeautifulSoup(html_string, "html.parser")
         if page == 1:
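
The params rewrite replaces dict entries that could hold None with conditional insertion, so the dict only ever contains filters that were actually set; the response.content to response.text change hands BeautifulSoup a decoded string rather than raw bytes. A self-contained sketch of the conditional-params pattern, with hypothetical values and requests used for illustration (the scraper's own session object may be a different HTTP client):

    import requests

    # Base query; always-present keys stay in the literal.
    params = {"search": "software engineer", "location": "Dallas, TX", "page": 1}

    is_remote = False       # hypothetical inputs
    distance = 25
    job_type_value = None

    # Optional filters are added only when they have a real value, instead of
    # being present with value None.
    if is_remote:
        params["refine_by_location_type"] = "only_remote"
    if distance:
        params["radius"] = distance
    if job_type_value:
        params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"

    response = requests.get("https://www.ziprecruiter.com/jobs-search", params=params)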

tests/test_indeed.py Normal file
View File

@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None

View File

@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None
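
Both new tests are smoke tests: they assert only that something non-None comes back. Since scrape_jobs now returns a pandas DataFrame, a slightly stricter variant (hypothetical, not part of this commit) could also check that the frame is non-empty:

    from src import scrape_jobs

    def test_indeed_returns_results():
        result = scrape_jobs(
            site_name="indeed",
            search_term="software engineer",
        )

        assert result is not None
        assert not result.empty  # .empty is standard pandas; fails if no jobs were scraped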