- indeed parallel job search

pull/12/head
zacharyhampton 2023-07-11 11:02:46 -05:00
parent a0425ef480
commit 59f0780831
3 changed files with 135 additions and 120 deletions

View File

@ -1,6 +1,11 @@
from ..jobs import * from ..jobs import *
class StatusException(Exception):
    """
    Raised when a scraped site answers with an unexpected HTTP status code.

    :param status_code: HTTP status code of the offending response
    """

    def __init__(self, status_code: int):
        #: forward the code to Exception so ``args`` and ``str()`` carry it
        super().__init__(status_code)
        self.status_code = status_code
class Site(Enum): class Site(Enum):
LINKEDIN = "linkedin" LINKEDIN = "linkedin"
INDEED = "indeed" INDEED = "indeed"

View File

@ -8,7 +8,9 @@ from bs4.element import Tag
from fastapi import status from fastapi import status
from api.core.jobs import * from api.core.jobs import *
from api.core.scrapers import Scraper, ScraperInput, Site from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
class ParsingException(Exception): class ParsingException(Exception):
@ -25,21 +27,12 @@ class IndeedScraper(Scraper):
self.url = "https://www.indeed.com/jobs" self.url = "https://www.indeed.com/jobs"
self.job_url = "https://www.indeed.com/viewjob?jk=" self.job_url = "https://www.indeed.com/viewjob?jk="
def scrape(self, scraper_input: ScraperInput) -> JobResponse: self.jobs_per_page = 15
""" self.seen_urls = set()
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input: def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
:return: job_response job_list = []
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
job_list: list[JobPost] = []
page = 0
processed_jobs, total_num_jobs = 0, 0
seen_urls = set()
while len(job_list) < scraper_input.results_wanted:
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@ -61,38 +54,26 @@ class IndeedScraper(Scraper):
response.status_code != status.HTTP_200_OK response.status_code != status.HTTP_200_OK
and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
): ):
return JobResponse( raise StatusException(response.status_code)
success=False,
error=f"Response returned {response.status_code}",
)
soup = BeautifulSoup(response.content, "html.parser") soup = BeautifulSoup(response.content, "html.parser")
try: jobs = IndeedScraper.parse_jobs(soup) #: can raise exception, handled by main scrape function
jobs = IndeedScraper.parse_jobs(soup)
except ParsingException:
return JobResponse(
success=False,
error="Failed to parse jobs.",
)
total_num_jobs = IndeedScraper.total_jobs(soup) #: total_num_jobs = IndeedScraper.total_jobs(soup) #: for now
if ( if (
not jobs.get("metaData", {}) not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {}) .get("mosaicProviderJobCardsModel", {})
.get("results") .get("results")
): ):
return JobResponse( raise Exception('No jobs found.')
success=False,
error="No jobs found",
)
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
processed_jobs += 1
job_url = f'{self.job_url}{job["jobkey"]}' job_url = f'{self.job_url}{job["jobkey"]}'
if job_url in seen_urls: if job_url in self.seen_urls:
continue continue
snippet_html = BeautifulSoup(job["snippet"], "html.parser") snippet_html = BeautifulSoup(job["snippet"], "html.parser")
extracted_salary = job.get("extractedSalary") extracted_salary = job.get("extractedSalary")
@ -136,24 +117,58 @@ class IndeedScraper(Scraper):
job_url=job_url, job_url=job_url,
) )
job_list.append(job_post) job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
if ( return job_list
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
page += 1
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
    """
    Scrapes Indeed for jobs with scraper_input criteria.

    Page 0 is fetched on the calling thread; any further pages are fetched
    through a thread pool that reuses the same session.

    :param scraper_input: search criteria; ``results_wanted`` caps the number of jobs returned
    :return: job_response; ``success=False`` with an ``error`` message if any page fails
    """
    session = tls_client.Session(
        client_identifier="chrome112", random_tls_extension_order=True
    )

    #: total pages needed, rounded up. Page 0 is always requested, so only
    #: pages 1..total_pages-1 go to the pool. The previous floor division
    #: plus the unconditional first page fetched one page too many whenever
    #: results_wanted was an exact multiple of jobs_per_page.
    total_pages = (
        scraper_input.results_wanted + self.jobs_per_page - 1
    ) // self.jobs_per_page

    try:
        #: get first page to initialize session
        job_list = self.scrape_page(scraper_input, 0, session)

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures: list[Future] = [
                executor.submit(self.scrape_page, scraper_input, page, session)
                for page in range(1, total_pages)
            ]

            #: consume in submission order so results stay in page order;
            #: result() re-raises whatever the worker raised, so a single
            #: failing page fails the whole scrape via the handlers below
            for future in futures:
                job_list.extend(future.result())

    except StatusException as e:
        return JobResponse(
            success=False,
            error=f"Indeed returned status code {e.status_code}",
        )
    except ParsingException as e:
        return JobResponse(
            success=False,
            error=f"Indeed failed to parse response: {e}",
        )
    except Exception as e:
        return JobResponse(
            success=False,
            error=f"Indeed failed to scrape: {e}",
        )

    #: whole pages are fetched, so trim back down to what the caller asked for
    job_list = job_list[: scraper_input.results_wanted]

    return JobResponse(
        success=True,
        jobs=job_list,
        job_count=len(job_list),
    )

View File

@ -7,15 +7,10 @@ from fastapi import status
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from api.core.scrapers import Scraper, ScraperInput, Site from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import * from api.core.jobs import *
class StatusException(Exception):
    """
    Raised when a scraped site answers with an unexpected HTTP status code.

    :param status_code: HTTP status code of the offending response
    """

    def __init__(self, status_code: int):
        #: forward the code to Exception so ``args`` and ``str()`` carry it
        super().__init__(status_code)
        self.status_code = status_code
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
def __init__(self): def __init__(self):
""" """