Add configurable parallel/sequential pagination with parallel parameter

- Add `parallel: bool = True` parameter to control pagination strategy
- Parallel mode (default): Fetches all pages in parallel for maximum speed
- Sequential mode: Fetches pages one-by-one with early termination checks
- Early termination stops pagination when time-based filters indicate no more matches
- Useful for rate limiting and narrow time windows
- Simplified pagination logic by removing hybrid first-page pre-check
- Updated README with usage example and parameter documentation
- Version bump to 0.8.4
- All 54 tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-11-13 10:36:47 -08:00
parent 9b61a89c77
commit c2f01df1ad
5 changed files with 68 additions and 32 deletions

View File

@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
sort_by: str | None = None
sort_direction: str = "desc"
# Pagination control
parallel: bool = True
class Scraper:
session = None
@@ -141,6 +144,9 @@ class Scraper:
self.sort_by = scraper_input.sort_by
self.sort_direction = scraper_input.sort_direction
# Pagination control
self.parallel = scraper_input.parallel
def search(self) -> list[Union[Property | dict]]: ...
@staticmethod

View File

@@ -526,39 +526,49 @@ class RealtorScraper(Scraper):
total = result["total"]
homes = result["properties"]
# Pre-check: Should we continue pagination?
# This optimization prevents unnecessary API calls when using time-based filters
# with date sorting. If page 1's last property is outside the time window,
# all future pages will also be outside (due to sort order).
should_continue_pagination = self._should_fetch_more_pages(homes)
# Fetch remaining pages based on parallel parameter
if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
if self.parallel:
# Parallel mode: Fetch all remaining pages in parallel
with ThreadPoolExecutor() as executor:
futures_with_offsets = [
(i, executor.submit(
self.general_search,
variables=search_variables | {"offset": i},
search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
)
]
# Only launch parallel pagination if needed
if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
with ThreadPoolExecutor() as executor:
# Store futures with their offsets to maintain proper sort order
# Start from offset + page_size and go up to offset + limit
futures_with_offsets = [
(i, executor.submit(
self.general_search,
variables=search_variables | {"offset": i},
# Collect results and sort by offset to preserve API sort order
results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
else:
# Sequential mode: Fetch pages one by one with early termination checks
for current_offset in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
):
# Check if we should continue based on time-based filters
if not self._should_fetch_more_pages(homes):
break
result = self.general_search(
variables=search_variables | {"offset": current_offset},
search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
)
]
# Collect results and sort by offset to preserve API sort order across pages
results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
page_properties = result["properties"]
homes.extend(page_properties)
# Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision)