- indeed parallel job search

pull/12/head
zacharyhampton 2023-07-11 11:02:46 -05:00
parent a0425ef480
commit 59f0780831
3 changed files with 135 additions and 120 deletions

View File

@@ -1,6 +1,11 @@
from ..jobs import *
class StatusException(Exception):
    """Raised when a scraped site responds with a non-success HTTP status.

    :param status_code: the HTTP status code returned by the remote site
    """

    def __init__(self, status_code: int):
        # Forward a message to Exception so str(e)/repr(e) are informative;
        # the original stored the attribute but left the exception message empty.
        super().__init__(f"Status code {status_code} returned")
        self.status_code = status_code
class Site(Enum):
    """Job boards that the scrapers support."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"

View File

@@ -8,7 +8,9 @@ from bs4.element import Tag
from fastapi import status
from api.core.jobs import *
from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
class ParsingException(Exception):
@@ -25,21 +27,12 @@ class IndeedScraper(Scraper):
self.url = "https://www.indeed.com/jobs"
self.job_url = "https://www.indeed.com/viewjob?jk="
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
self.jobs_per_page = 15
self.seen_urls = set()
def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
job_list = []
job_list: list[JobPost] = []
page = 0
processed_jobs, total_num_jobs = 0, 0
seen_urls = set()
while len(job_list) < scraper_input.results_wanted:
params = {
"q": scraper_input.search_term,
"location": scraper_input.location,
@@ -61,38 +54,26 @@ class IndeedScraper(Scraper):
response.status_code != status.HTTP_200_OK
and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
):
return JobResponse(
success=False,
error=f"Response returned {response.status_code}",
)
raise StatusException(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")
try:
jobs = IndeedScraper.parse_jobs(soup)
except ParsingException:
return JobResponse(
success=False,
error="Failed to parse jobs.",
)
jobs = IndeedScraper.parse_jobs(soup) #: can raise exception, handled by main scrape function
total_num_jobs = IndeedScraper.total_jobs(soup)
#: total_num_jobs = IndeedScraper.total_jobs(soup) #: for now
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
return JobResponse(
success=False,
error="No jobs found",
)
raise Exception('No jobs found.')
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
processed_jobs += 1
job_url = f'{self.job_url}{job["jobkey"]}'
if job_url in seen_urls:
if job_url in self.seen_urls:
continue
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
extracted_salary = job.get("extractedSalary")
@@ -136,24 +117,58 @@ class IndeedScraper(Scraper):
job_url=job_url,
)
job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
page += 1
return job_list
job_list = job_list[: scraper_input.results_wanted]
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
pages_to_process = scraper_input.results_wanted // self.jobs_per_page
try:
#: get first page to initialize session
job_list = self.scrape_page(scraper_input, 0, session)
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [
executor.submit(
self.scrape_page, scraper_input, page, session
) for page in range(1, pages_to_process + 1)
]
for future in futures:
result = future.result()
job_list += result
except StatusException as e:
return JobResponse(
success=False,
error=f"Indeed returned status code {e.status_code}",
)
except ParsingException as e:
return JobResponse(
success=False,
error=f"Indeed failed to parse response: {e}",
)
except Exception as e:
return JobResponse(
success=False,
error=f"Indeed failed to scrape: {e}",
)
#: job_list = job_list[: scraper_input.results_wanted]
job_response = JobResponse(
success=True,
jobs=job_list,
job_count=total_num_jobs,
job_count=len(job_list),
)
return job_response

View File

@@ -7,15 +7,10 @@ from fastapi import status
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, Future
from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import *
class StatusException(Exception):
    """Raised when a scraped site responds with a non-success HTTP status.

    NOTE(review): this duplicates ``StatusException`` in ``api.core.scrapers`` —
    prefer importing the shared class instead of redefining it here, so both
    scrapers raise the same exception type.

    :param status_code: the HTTP status code returned by the remote site
    """

    def __init__(self, status_code: int):
        # Forward a message to Exception so str(e)/repr(e) are informative;
        # the original stored the attribute but left the exception message empty.
        super().__init__(f"Status code {status_code} returned")
        self.status_code = status_code
class ZipRecruiterScraper(Scraper):
def __init__(self):
"""