mirror of https://github.com/Bunsly/JobSpy

commit 0fa92544c9 ("full change")
parent 9a86d2b1f5
@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
-
-router = APIRouter(prefix="/jobs", tags=["jobs"])
+
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union
 
 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
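SCRAPER_MAPPING is a plain registry dict: the `scrape_site` closure further down looks a scraper class up by `Site` member and instantiates it per request. A minimal sketch of the pattern, with stand-in classes (not the package's real ones):

from enum import Enum


class Site(Enum):                     # stand-in for the package's Site enum
    LINKEDIN = "linkedin"


class LinkedInScraper:                # stand-in scraper
    def scrape(self, search_term: str) -> str:
        return f"linkedin results for {search_term!r}"


SCRAPER_MAPPING = {Site.LINKEDIN: LinkedInScraper}

scraper_class = SCRAPER_MAPPING[Site.LINKEDIN]  # look up the class by Site member
scraper = scraper_class()                       # then instantiate it per request
print(scraper.scrape("software engineer"))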
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }
 
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
+
-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15
+) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
     :return: scraper_response
     """
 
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format
+    )
+
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
 
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
-    scraper_response = CommonResponse(status="JSON response success", **results)
 
-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
+    df = pd.DataFrame()
 
-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
+    for site in results:
+        for job in results[site].jobs:
+            data = job.json()
+
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            #: concat
+            df = pd.concat([df, data_df], axis=1)
+
+    return df
 
-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
 
-    else:
-        return scraper_response
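Together these hunks replace the HTTP endpoint with a synchronous library call: `scrape_site` returns a `(site_name, JobResponse)` tuple, so `dict(executor.map(...))` collapses the thread results straight into a site-to-response mapping. A usage sketch against the new signature (argument values are illustrative):

from src import scrape_jobs

# site_name accepts a string, a single Site member, or a list of Site members
df = scrape_jobs(
    site_name="indeed",
    search_term="software engineer",
    location="Dallas, TX",   # illustrative values
    results_wanted=10,
)

print(df.shape)

One caveat worth flagging: `pd.concat([df, data_df], axis=1)` adds each job Series as a new column, so the frame comes back with one column per job and one row per field; transposing the result (`df.T`), or collecting rows and calling `pd.DataFrame` once, would give the conventional row-per-job layout.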
@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any
 
@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }
+
+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
 
         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
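Assuming `session` here is `requests`-compatible, both versions encode the same query string, since `requests` silently drops `None`-valued params; the rewrite just makes the optional filters explicit. A standalone illustration (no request is actually sent):

import requests

params = {"search": "software engineer", "location": "Dallas, TX", "page": 1}

is_remote = True
if is_remote:
    params["refine_by_location_type"] = "only_remote"

prepared = requests.Request(
    "GET", "https://www.ziprecruiter.com/jobs-search", params=params
).prepare()
print(prepared.url)
# .../jobs-search?search=software+engineer&location=Dallas%2C+TX&page=1&refine_by_location_type=only_remote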
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)
 
-        html_string = response.content
+        html_string = response.text
         soup = BeautifulSoup(html_string, "html.parser")
 
         if page == 1:
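`response.content` is the raw byte payload, while `response.text` is the same body decoded to `str` using the charset `requests` detects; BeautifulSoup accepts both, but passing the decoded text keeps the encoding decision with `requests`. For example:

import requests
from bs4 import BeautifulSoup

resp = requests.get("https://example.com", timeout=10)

assert isinstance(resp.content, bytes)  # raw body
assert isinstance(resp.text, str)       # decoded with the detected charset

soup = BeautifulSoup(resp.text, "html.parser")
print(soup.title.string)  # "Example Domain"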
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None
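Both test modules follow pytest naming conventions, so a bare `pytest` run picks them up; note they hit the live sites. Since `scrape_jobs` now returns a DataFrame, the assertion could also be tightened; a sketch (the test name is a hypothetical addition):

import pandas as pd

from src import scrape_jobs


def test_indeed_returns_dataframe():
    result = scrape_jobs(site_name="indeed", search_term="software engineer")

    assert isinstance(result, pd.DataFrame)
    assert not result.empty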