mirror of https://github.com/Bunsly/JobSpy
parent 9a86d2b1f5
commit 0fa92544c9
@@ -1,6 +1,3 @@
-import io
-from fastapi import APIRouter
-from fastapi.responses import StreamingResponse
 from concurrent.futures import ThreadPoolExecutor
 
 from .core.scrapers.indeed import IndeedScraper
@@ -14,9 +11,10 @@ from .core.scrapers import (
     OutputFormat,
     CommonResponse,
 )
-from typing import List, Dict, Tuple, Union
+import pandas as pd
+from .core.jobs import JobType
+from typing import List, Dict, Tuple, Union
 
-router = APIRouter(prefix="/jobs", tags=["jobs"])
 
 SCRAPER_MAPPING = {
     Site.LINKEDIN: LinkedInScraper,
@@ -24,15 +22,44 @@ SCRAPER_MAPPING = {
     Site.ZIP_RECRUITER: ZipRecruiterScraper,
 }
 
 
-@router.post("/")
-async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
+
+
+def scrape_jobs(
+    site_name: str | Site | List[Site],
+    search_term: str,
+    output_format: OutputFormat = OutputFormat.JSON,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: JobType = None,
+    easy_apply: bool = False,  # linkedin
+    results_wanted: int = 15
+) -> pd.DataFrame:
     """
     Asynchronously scrapes job data from multiple job sites.
     :param scraper_input:
     :return: scraper_response
     """
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+        output_format=output_format
+    )
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class()
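Usage note: with the FastAPI route removed, callers invoke scrape_jobs directly and get a DataFrame back. A minimal sketch, assuming Site is importable from src.core.scrapers (the relative imports above suggest so) and network access is available:

    from src import scrape_jobs
    from src.core.scrapers import Site  # assumed export path for the Site enum

    # A bare string is uppercased and mapped through the Site enum by
    # _map_str_to_site; multiple sites must be passed as Site members,
    # since only a single str is coerced.
    df = scrape_jobs(
        site_name=[Site.INDEED, Site.LINKEDIN],
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=10,
    )
    print(df.head())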
@@ -41,28 +68,19 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
 
     with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))
-    scraper_response = CommonResponse(status="JSON response success", **results)
 
-    if scraper_input.output_format == OutputFormat.CSV:
-        csv_output = CSVFormatter.format(scraper_response)
-        response = StreamingResponse(csv_output, media_type="text/csv")
-        response.headers[
-            "Content-Disposition"
-        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
-        return response
-
-    elif scraper_input.output_format == OutputFormat.GSHEET:
-        csv_output = CSVFormatter.format(scraper_response)
-        try:
-            CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(
-                status="Successfully uploaded to Google Sheets", **results
-            )
-
-        except Exception as e:
-            return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e), **results
-            )
-
-    else:
-        return scraper_response
+    df = pd.DataFrame()
+    for site in results:
+        for job in results[site].jobs:
+            data = job.json()
+            data_df = pd.read_json(data, typ='series')
+            data_df['site'] = site
+
+            #: concat
+            df = pd.concat([df, data_df], axis=1)
+
+    return df
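The new loop flattens each job into a pandas Series and concatenates as it goes. A standalone sketch of that pattern, with hypothetical payloads standing in for job.json() output (note that axis=1 stacks jobs as columns, so the result usually wants a transpose):

    from io import StringIO

    import pandas as pd

    # Hypothetical JSON strings in place of job.json().
    jobs = [
        '{"title": "Backend Engineer", "company": "Acme"}',
        '{"title": "Data Engineer", "company": "Globex"}',
    ]

    df = pd.DataFrame()
    for data in jobs:
        data_df = pd.read_json(StringIO(data), typ="series")  # one Series per job
        data_df["site"] = "indeed"
        df = pd.concat([df, data_df], axis=1)  # each job becomes a column

    print(df.T)  # transpose for the usual one-row-per-job shape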
@@ -1,4 +1,4 @@
-from ..jobs import *
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from ..formatters import OutputFormat
 from typing import List, Dict, Optional, Any
 
@@ -51,16 +51,18 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
         }
 
+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"
+
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+
         response = session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
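Design note: if the session here behaves like requests (an assumption; the scraper may wrap a different HTTP client), None-valued params were already being dropped from the query string, so the refactor is mainly about readability, keeping optional keys out of the dict literal entirely. A quick check of that behavior with plain requests:

    import requests

    # "radius" is None, mirroring the old dict-literal style.
    req = requests.Request(
        "GET",
        "https://example.com/jobs-search",
        params={"search": "software engineer", "page": 1, "radius": None},
    )
    # requests silently skips None values when building the query string.
    print(req.prepare().url)
    # https://example.com/jobs-search?search=software+engineer&page=1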
@@ -70,7 +72,7 @@ class ZipRecruiterScraper(Scraper):
         if response.status_code != status.HTTP_200_OK:
             raise StatusException(response.status_code)
 
-        html_string = response.content
+        html_string = response.text
         soup = BeautifulSoup(html_string, "html.parser")
 
         if page == 1:
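The content-to-text switch trades raw bytes for a decoded str; BeautifulSoup accepts both, but .text makes the decoding step explicit rather than leaving it to the parser. A minimal illustration, assuming a requests-style response object:

    import requests
    from bs4 import BeautifulSoup

    resp = requests.get("https://example.com")
    assert isinstance(resp.content, bytes)  # raw body
    assert isinstance(resp.text, str)       # decoded via resp.encoding
    soup = BeautifulSoup(resp.text, "html.parser")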
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="software engineer",
+    )
+
+    assert result is not None
@@ -0,0 +1,10 @@
+from src import scrape_jobs
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter",
+        search_term="software engineer",
+    )
+
+    assert result is not None