
pull/31/head
Zachary Hampton 2023-09-02 13:50:12 -07:00
parent 9a86d2b1f5
commit 0fa92544c9
5 changed files with 77 additions and 37 deletions


@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
-router = APIRouter(prefix="/jobs", tags=["jobs"])
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union

 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }

+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]

-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15
+) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
     :return: scraper_response
     """
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format
+    )

     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))

     scraper_response = CommonResponse(status="JSON response success", **results)

-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
+    df = pd.DataFrame()

-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
+    for site in results:
+        for job in results[site].jobs:
+            data = job.json()
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            #: concat
+            df = pd.concat([df, data_df], axis=1)
+    return df

-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
-    else:
-        return scraper_response
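
Taken together, this file turns the old POST /jobs endpoint into a plain
library call: the caller passes the site(s) and search options, the function
fans the work out across sites with a ThreadPoolExecutor, and each job's JSON
is folded into a pandas DataFrame. A minimal usage sketch, assuming the
package root re-exports scrape_jobs the way the new tests import it (the
location and results_wanted values here are illustrative):

    from src import scrape_jobs

    df = scrape_jobs(
        site_name="zip_recruiter",  # a str, a Site member, or a List[Site]
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=10,
    )

    # This commit concatenates each job Series with axis=1, so jobs arrive
    # as columns; transpose for the usual one-row-per-job layout.
    print(df.T.head())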

@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any

@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }

+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"

         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)

-        html_string = response.content
+        html_string = response.text
         soup = BeautifulSoup(html_string, "html.parser")

         if page == 1:
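
The ZipRecruiter change replaces inline "x if cond else None" dict values
with conditional assignment after the dict literal, so optional filters only
appear in the query when they actually carry a value; it also hands
BeautifulSoup decoded text (response.text) instead of raw bytes
(response.content). A standalone sketch of the params pattern, using the
same keys as the diff (the build_params wrapper itself is illustrative):

    def build_params(search: str, location: str, page: int,
                     distance: int | None = None,
                     is_remote: bool = False,
                     job_type_value: str | None = None) -> dict:
        # Start from the always-present keys...
        params = {"search": search, "location": location, "page": page}
        # ...then add optional refinements only when they have a value.
        if is_remote:
            params["refine_by_location_type"] = "only_remote"
        if distance:
            params["radius"] = distance
        if job_type_value:
            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
        return params

    assert "radius" not in build_params("dev", "NYC", 1)
    assert build_params("dev", "NYC", 1, distance=25)["radius"] == 25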

tests/test_indeed.py (new file)
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None

tests/test_ziprecruiter.py (new file)
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None
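
Both new tests are live-network smoke tests: they hit the real sites and only
assert that something came back. A sketch of a slightly stronger assertion,
assuming this commit's DataFrame layout (one column per job via
pd.concat(..., axis=1); the test name is hypothetical):

    from src import scrape_jobs

    def test_ziprecruiter_returns_jobs():
        result = scrape_jobs(
            site_name="zip_recruiter",
            search_term="software engineer",
        )

        assert result is not None
        assert not result.empty  # at least one job Series was concatenated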