- total vs returned schema definition

- scraper bug fixes
pull/12/head
zacharyhampton 2023-07-11 12:00:24 -05:00
parent 59f0780831
commit 16ddb9b485
3 changed files with 64 additions and 24 deletions

View File

@@ -2,7 +2,7 @@ from typing import Union
 from datetime import datetime
 from enum import Enum
-from pydantic import BaseModel
+from pydantic import BaseModel, validator


 class JobType(Enum):
@@ -57,5 +57,15 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None
-    job_count: int = None
     jobs: list[JobPost] = []
+    total_results: int = None
+    returned_results: int = None
+
+    @validator("returned_results")
+    def set_returned_results(cls, v, values):
+        if v is None and values.get("jobs"):
+            return len(values["jobs"])
+        return v
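A note on the new validator: in pydantic v1 (which this code targets), a validator receives the previously validated fields through `values`, and fields are processed in declaration order, so `jobs` is already available when `returned_results` is checked. One caveat: v1 validators do not run on default values unless `always=True` is passed, so as written the auto-fill only fires when `returned_results` is supplied explicitly. A minimal standalone sketch, with a stub `JobPost` and the `always=True` flag added to illustrate the intended behavior:

import json
from pydantic import BaseModel, validator


class JobPost(BaseModel):
    # stub for illustration; the real JobPost in this repo has many more fields
    title: str


class JobResponse(BaseModel):
    success: bool
    error: str = None
    jobs: list[JobPost] = []
    total_results: int = None
    returned_results: int = None

    # always=True makes pydantic v1 run the validator even when the field is
    # omitted and its default (None) is used; without it, the validator only
    # fires on explicitly supplied values
    @validator("returned_results", always=True)
    def set_returned_results(cls, v, values):
        if v is None and values.get("jobs"):
            return len(values["jobs"])
        return v


resp = JobResponse(success=True, jobs=[JobPost(title="a"), JobPost(title="b")])
print(resp.returned_results)  # 2, derived from len(jobs)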

View File

@@ -1,6 +1,6 @@
 import re
 import json
-from typing import Optional
+from typing import Optional, Tuple, List
 import tls_client
 from bs4 import BeautifulSoup
@@ -8,9 +8,11 @@ from bs4.element import Tag
 from fastapi import status
 from api.core.jobs import *
+from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from concurrent.futures import ThreadPoolExecutor, Future
+import math


 class ParsingException(Exception):
@@ -30,7 +32,21 @@ class IndeedScraper(Scraper):
         self.jobs_per_page = 15
         self.seen_urls = set()

-    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+    def scrape_page(
+        self,
+        scraper_input: ScraperInput,
+        page: int,
+        session: tls_client.Session
+    ) -> tuple[list[JobPost], int]:
+        """
+        Scrapes a page of Indeed for jobs with scraper_input criteria
+        :param scraper_input:
+        :param page:
+        :param session:
+        :return: jobs found on page, total number of jobs found for search
+        """
         job_list = []
         params = {
@@ -59,8 +75,7 @@ class IndeedScraper(Scraper):
         soup = BeautifulSoup(response.content, "html.parser")
         jobs = IndeedScraper.parse_jobs(soup)  #: can raise exception, handled by main scrape function
-        #: total_num_jobs = IndeedScraper.total_jobs(soup)  #: for now
+        total_num_jobs = IndeedScraper.total_jobs(soup)

         if (
             not jobs.get("metaData", {})
@@ -118,7 +133,7 @@ class IndeedScraper(Scraper):
             )
             job_list.append(job_post)

-        return job_list
+        return job_list, total_num_jobs

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -130,11 +145,11 @@ class IndeedScraper(Scraper):
             client_identifier="chrome112", random_tls_extension_order=True
         )

-        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
+        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1

        try:
            #: get first page to initialize session
-            job_list = self.scrape_page(scraper_input, 0, session)
+            job_list, total_results = self.scrape_page(scraper_input, 0, session)

             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
@@ -144,9 +159,9 @@ class IndeedScraper(Scraper):
                 ]

                 for future in futures:
-                    result = future.result()
-                    job_list += result
+                    jobs, _ = future.result()
+                    job_list += jobs

         except StatusException as e:
             return JobResponse(
@@ -164,11 +179,13 @@ class IndeedScraper(Scraper):
                 error=f"Indeed failed to scrape: {e}",
             )

-        #: job_list = job_list[: scraper_input.results_wanted]
+        if len(job_list) > scraper_input.results_wanted:
+            job_list = job_list[:scraper_input.results_wanted]

         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=len(job_list),
+            total_results=total_results,
         )
         return job_response
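On the pagination change: ceil(results_wanted / jobs_per_page) pages exactly cover the requested results, and because Indeed's zero-indexed page 0 is fetched synchronously to initialize the session, only ceil - 1 further pages need dispatching to the thread pool. The old floor division plus the prefetched page over-fetched a whole extra page whenever results_wanted was a multiple of jobs_per_page. A small worked sketch (the helper name is illustrative, not from the repo):

import math

def pages_to_dispatch(results_wanted: int, jobs_per_page: int) -> int:
    # pages that exactly cover results_wanted, rounding a partial page up,
    # minus the zero-indexed page 0 already fetched to set up the session
    return math.ceil(results_wanted / jobs_per_page) - 1

assert pages_to_dispatch(100, 15) == 6  # pages 1..6 after page 0: 7 * 15 = 105 >= 100
assert pages_to_dispatch(30, 15) == 1   # old 30 // 15 == 2 dispatched a useless third page

Any partial-page surplus is then trimmed by the new truncation slice before building the response.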

View File

@@ -1,5 +1,5 @@
 import json
-from typing import Optional
+from typing import Optional, Tuple, List
 from urllib.parse import urlparse, parse_qs
 import tls_client
@@ -7,8 +7,10 @@ from fastapi import status
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, Future

+from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
+import math


 class ZipRecruiterScraper(Scraper):
@@ -23,13 +25,19 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()

-    def scrape_page(self, scraper_input: ScraperInput, page: int, session: tls_client.Session) -> list[JobPost]:
+    def scrape_page(
+        self,
+        scraper_input: ScraperInput,
+        page: int,
+        session: tls_client.Session
+    ) -> tuple[list[JobPost], int | None]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
         :param page:
         :param session:
-        :return:
+        :return: jobs found on page, total number of jobs found for search
         """

         job_list = []
@@ -69,7 +77,9 @@ class ZipRecruiterScraper(Scraper):
             script_tag = soup.find("script", {"id": "js_variables"})
             data = json.loads(script_tag.string)
-            #: job_count = int(data["totalJobCount"].replace(",", ""))
+            job_count = int(data["totalJobCount"].replace(",", ""))
+        else:
+            job_count = None

         job_posts = soup.find_all("div", {"class": "job_content"})
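The now-enabled count comes from the `js_variables` JSON blob that ZipRecruiter embeds in the page, with a None fallback when the tag is absent (hence the `int | None` in the new return type). A hedged standalone sketch of that guarded parse (`parse_total_job_count` is a hypothetical helper, not a function in the repo):

import json
from bs4 import BeautifulSoup

def parse_total_job_count(html: str) -> int | None:
    # mirror of the guarded parse above, isolated for illustration
    soup = BeautifulSoup(html, "html.parser")
    script_tag = soup.find("script", {"id": "js_variables"})
    if script_tag:
        data = json.loads(script_tag.string)
        # ZipRecruiter renders counts like "1,234", so strip the commas
        return int(data["totalJobCount"].replace(",", ""))
    return None

print(parse_total_job_count(
    '<script id="js_variables">{"totalJobCount": "1,234"}</script>'
))  # 1234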
@@ -110,7 +120,7 @@ class ZipRecruiterScraper(Scraper):
             )
             job_list.append(job_post)

-        return job_list
+        return job_list, job_count

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -121,11 +131,12 @@ class ZipRecruiterScraper(Scraper):
         session = tls_client.Session(
             client_identifier="chrome112", random_tls_extension_order=True
         )
-        pages_to_process = scraper_input.results_wanted // self.jobs_per_page
+
+        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)

         try:
             #: get first page to initialize session
-            job_list = self.scrape_page(scraper_input, 1, session)
+            job_list, total_results = self.scrape_page(scraper_input, 1, session)

             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
@@ -135,9 +146,9 @@ class ZipRecruiterScraper(Scraper):
                 ]

                 for future in futures:
-                    result = future.result()
-                    job_list += result
+                    jobs, _ = future.result()
+                    job_list += jobs

         except StatusException as e:
             return JobResponse(
@@ -147,11 +158,13 @@ class ZipRecruiterScraper(Scraper):
         #: note: this does not handle if the results are more or less than the results_wanted
-        #: job_list = job_list[:scraper_input.results_wanted]
+        if len(job_list) > scraper_input.results_wanted:
+            job_list = job_list[:scraper_input.results_wanted]

         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=len(job_list),
+            total_results=total_results,
         )
         return job_response
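Across both scrapers, the per-page result is now a (jobs, total) tuple. Since every page of the same search reports the same search-wide total, only the synchronously fetched first page's value is kept; the worker pages discard theirs via `jobs, _ = future.result()`. A toy sketch of that consumption pattern, where `fake_scrape_page` stands in for the real scraper method:

from concurrent.futures import ThreadPoolExecutor

def fake_scrape_page(page: int) -> tuple[list[str], int]:
    # stand-in for scrape_page: every page reports the same search-wide total
    return [f"job-{page}-{i}" for i in range(2)], 42

# first page fetched synchronously; its total is the one kept
job_list, total_results = fake_scrape_page(0)

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fake_scrape_page, page) for page in range(1, 4)]
    for future in futures:
        jobs, _ = future.result()  # per-page totals are redundant, so drop them
        job_list += jobs

print(total_results, len(job_list))  # 42 8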