- zip_recruiter parallel job search

pull/12/head
zacharyhampton 2023-07-11 10:49:36 -05:00
parent e86acba9f8
commit a0425ef480
1 changed files with 119 additions and 95 deletions

View File

@ -5,11 +5,17 @@ from urllib.parse import urlparse, parse_qs
import tls_client import tls_client
from fastapi import status from fastapi import status
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, Future
from api.core.scrapers import Scraper, ScraperInput, Site from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.jobs import * from api.core.jobs import *
class StatusException(Exception):
    """
    Raised when a scraped page's response does not come back with HTTP 200.

    The offending status code is kept on the instance as ``status_code`` so the
    caller can build its own error response from it.
    :param status_code: HTTP status code of the failed response
    """

    def __init__(self, status_code: int):
        #: hand a message to Exception so str(exc), logs and tracebacks show the
        #: status code instead of an empty string
        super().__init__(f"Response returned status code {status_code}")
        self.status_code = status_code
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
def __init__(self): def __init__(self):
""" """
@ -19,22 +25,20 @@ class ZipRecruiterScraper(Scraper):
super().__init__(site) super().__init__(site)
self.url = "https://www.ziprecruiter.com/jobs-search" self.url = "https://www.ziprecruiter.com/jobs-search"
self.jobs_per_page = 20
self.seen_urls = set()
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
""" """
Scrapes ZipRecruiter for jobs with scraper_input criteria Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:return: job_response :param page:
:param session:
:return:
""" """
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
job_list: list[JobPost] = [] job_list = []
page = 1
processed_jobs, job_count = 0, 0
seen_urls = set()
while len(job_list) < scraper_input.results_wanted:
job_type_value = None job_type_value = None
if scraper_input.job_type: if scraper_input.job_type:
if scraper_input.job_type.value == "fulltime": if scraper_input.job_type.value == "fulltime":
@ -60,12 +64,9 @@ class ZipRecruiterScraper(Scraper):
response = session.get( response = session.get(
self.url, headers=ZipRecruiterScraper.headers(), params=params self.url, headers=ZipRecruiterScraper.headers(), params=params
) )
print(response.url)
if response.status_code != status.HTTP_200_OK: if response.status_code != status.HTTP_200_OK:
return JobResponse( raise StatusException(response.status_code)
success=False,
error=f"Response returned {response.status_code}",
)
html_string = response.content html_string = response.content
soup = BeautifulSoup(html_string, "html.parser") soup = BeautifulSoup(html_string, "html.parser")
@ -73,16 +74,15 @@ class ZipRecruiterScraper(Scraper):
script_tag = soup.find("script", {"id": "js_variables"}) script_tag = soup.find("script", {"id": "js_variables"})
data = json.loads(script_tag.string) data = json.loads(script_tag.string)
job_count = data["totalJobCount"] #: job_count = int(data["totalJobCount"].replace(",", ""))
job_count = int(job_count.replace(",", ""))
job_posts = soup.find_all("div", {"class": "job_content"}) job_posts = soup.find_all("div", {"class": "job_content"})
for job in job_posts: for job in job_posts:
processed_jobs += 1
job_url = job.find("a", {"class": "job_link"})["href"] job_url = job.find("a", {"class": "job_link"})["href"]
if job_url in seen_urls: if job_url in self.seen_urls:
continue continue
title = job.find("h2", {"class": "title"}).text title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip() company = job.find("a", {"class": "company_name"}).text.strip()
description = job.find("p", {"class": "job_snippet"}).text.strip() description = job.find("p", {"class": "job_snippet"}).text.strip()
@ -114,25 +114,49 @@ class ZipRecruiterScraper(Scraper):
job_url=job_url, job_url=job_url,
) )
job_list.append(job_post) job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
break
if ( return job_list
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
break
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
    """
    Scrapes ZipRecruiter for jobs with scraper_input criteria.

    Page 1 is fetched on its own first; any further pages are fetched
    concurrently on a thread pool, all through the same session.
    :param scraper_input: search criteria; results_wanted caps how many jobs are returned
    :return: job_response with success=True and the collected jobs, or
        success=False and an error message if any page returned a non-200 status
    """
    session = tls_client.Session(
        client_identifier="chrome112", random_tls_extension_order=True
    )

    results_wanted = max(scraper_input.results_wanted, 0)
    #: round up so a partially needed last page is still fetched
    #: (e.g. 30 wanted at 20 per page needs 2 pages, floor division gave 1)
    pages_to_process = -(-results_wanted // self.jobs_per_page)

    try:
        #: get first page to initialize session
        job_list = self.scrape_page(scraper_input, 1, session)

        if pages_to_process > 1:
            with ThreadPoolExecutor(max_workers=10) as executor:
                futures: list[Future] = [
                    executor.submit(
                        self.scrape_page, scraper_input, page, session
                    )
                    for page in range(2, pages_to_process + 1)
                ]

                #: collect in submission order so jobs stay ordered by page;
                #: result() re-raises a StatusException raised inside a worker
                for future in futures:
                    job_list.extend(future.result())
    except StatusException as e:
        return JobResponse(
            success=False,
            error=f"ZipRecruiter returned status code {e.status_code}",
        )

    #: pages arrive whole, so trim any overshoot; fewer than results_wanted can
    #: still come back when pages hold fewer (or already seen) jobs
    job_list = job_list[:results_wanted]

    return JobResponse(
        success=True,
        jobs=job_list,
        job_count=len(job_list),
    )