Add pagination offset support for API queries

Implements offset parameter to enable pagination within the 10k API limit. Users can now fetch results in chunks (e.g., offset=200, limit=200 for results 200-399). Includes validation to ensure offset + limit doesn't exceed API maximum. Also fixes multi-page result sorting to preserve correct order across page boundaries.

Fixes #139

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-11-05 10:57:01 -08:00
parent 1608020b69
commit 21b6ba44f4
7 changed files with 195 additions and 33 deletions

View File

@@ -278,7 +278,9 @@ Optional
├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending' ├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending'
├── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
``` ```
### Property Schema ### Property Schema

View File

@@ -1,7 +1,7 @@
import warnings import warnings
import pandas as pd import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_datetime, validate_filters, validate_sort from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_offset, validate_datetime, validate_filters, validate_sort
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
from typing import Union, Optional, List from typing import Union, Optional, List
@@ -21,6 +21,7 @@ def scrape_property(
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000, limit: int = 10000,
offset: int = 0,
# New date/time filtering parameters # New date/time filtering parameters
past_hours: int = None, past_hours: int = None,
datetime_from: str = None, datetime_from: str = None,
@@ -61,6 +62,7 @@ def scrape_property(
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
:param limit: Limit the number of results returned. Maximum is 10,000. :param limit: Limit the number of results returned. Maximum is 10,000.
:param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data.
New parameters: New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering) :param past_hours: Get properties in the last _ hours (requires client-side filtering)
@@ -77,6 +79,7 @@ def scrape_property(
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
validate_limit(limit) validate_limit(limit)
validate_offset(offset, limit)
validate_datetime(datetime_from) validate_datetime(datetime_from)
validate_datetime(datetime_to) validate_datetime(datetime_to)
validate_filters( validate_filters(
@@ -100,6 +103,7 @@ def scrape_property(
extra_property_data=extra_property_data, extra_property_data=extra_property_data,
exclude_pending=exclude_pending, exclude_pending=exclude_pending,
limit=limit, limit=limit,
offset=offset,
# New date/time filtering # New date/time filtering
past_hours=past_hours, past_hours=past_hours,
datetime_from=datetime_from, datetime_from=datetime_from,

View File

@@ -25,6 +25,7 @@ class ScraperInput(BaseModel):
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000 limit: int = 10000
offset: int = 0
return_type: ReturnType = ReturnType.pandas return_type: ReturnType = ReturnType.pandas
# New date/time filtering parameters # New date/time filtering parameters
@@ -106,6 +107,7 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.offset = scraper_input.offset
self.return_type = scraper_input.return_type self.return_type = scraper_input.return_type
# New date/time filtering # New date/time filtering

View File

@@ -405,13 +405,23 @@ class RealtorScraper(Scraper):
if self.return_type != ReturnType.raw: if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(process_property, result, self.mls_only, self.extra_property_data, # Store futures with their indices to maintain sort order
self.exclude_pending, self.listing_type, get_key, process_extra_property_details) for result in properties_list] futures_with_indices = [
(i, executor.submit(process_property, result, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key, process_extra_property_details))
for i, result in enumerate(properties_list)
]
for future in as_completed(futures): # Collect results and sort by index to preserve API sort order
results = []
for idx, future in futures_with_indices:
result = future.result() result = future.result()
if result: if result:
properties.append(result) results.append((idx, result))
# Sort by index and extract properties in correct order
results.sort(key=lambda x: x[0])
properties = [result for idx, result in results]
else: else:
properties = properties_list properties = properties_list
@@ -428,7 +438,7 @@ class RealtorScraper(Scraper):
location_type = location_info["area_type"] location_type = location_info["area_type"]
search_variables = { search_variables = {
"offset": 0, "offset": self.offset,
} }
search_type = ( search_type = (
@@ -473,21 +483,30 @@ class RealtorScraper(Scraper):
homes = result["properties"] homes = result["properties"]
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = [ # Store futures with their offsets to maintain proper sort order
executor.submit( # Start from offset + page_size and go up to offset + limit
futures_with_offsets = [
(i, executor.submit(
self.general_search, self.general_search,
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) ))
for i in range( for i in range(
self.DEFAULT_PAGE_SIZE, self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.limit), min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE, self.DEFAULT_PAGE_SIZE,
) )
] ]
for future in as_completed(futures): # Collect results and sort by offset to preserve API sort order across pages
homes.extend(future.result()["properties"]) results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
# Apply client-side hour-based filtering if needed # Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision) # (API only supports day-level filtering, so we post-filter for hour precision)
@@ -498,6 +517,11 @@ class RealtorScraper(Scraper):
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from): elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
homes = self._apply_pending_date_filter(homes) homes = self._apply_pending_date_filter(homes)
# Apply client-side sort to ensure results are properly ordered
# This is necessary after filtering and to guarantee sort order across page boundaries
if self.sort_by:
homes = self._apply_sort(homes)
return homes return homes
def _apply_hour_based_date_filter(self, homes): def _apply_hour_based_date_filter(self, homes):
@@ -722,6 +746,60 @@ class RealtorScraper(Scraper):
return date_range['from_date'] <= date_obj <= date_range['to_date'] return date_range['from_date'] <= date_obj <= date_range['to_date']
return False return False
def _apply_sort(self, homes):
"""Apply client-side sorting to ensure results are properly ordered.
This is necessary because:
1. Multi-page results need to be re-sorted after concatenation
2. Filtering operations may disrupt the original sort order
Args:
homes: List of properties (either dicts or Property objects)
Returns:
Sorted list of properties
"""
if not homes or not self.sort_by:
return homes
def get_sort_key(home):
"""Extract the sort field value from a home (handles both dict and Property object)."""
if isinstance(home, dict):
value = home.get(self.sort_by)
else:
# Property object
value = getattr(home, self.sort_by, None)
# Handle None values - push them to the end
if value is None:
# Use a sentinel value that sorts to the end
return (1, 0) if self.sort_direction == "desc" else (1, float('inf'))
# For datetime fields, convert string to datetime for proper sorting
if self.sort_by in ['list_date', 'sold_date', 'pending_date']:
if isinstance(value, str):
try:
from datetime import datetime
# Handle timezone indicators
date_value = value
if date_value.endswith('Z'):
date_value = date_value[:-1] + '+00:00'
parsed_date = datetime.fromisoformat(date_value)
return (0, parsed_date)
except (ValueError, AttributeError):
# If parsing fails, treat as None
return (1, 0) if self.sort_direction == "desc" else (1, float('inf'))
return (0, value)
# For numeric fields, ensure we can compare
return (0, value)
# Sort the homes
reverse = (self.sort_direction == "desc")
sorted_homes = sorted(homes, key=get_sort_key, reverse=reverse)
return sorted_homes
@retry( @retry(

View File

@@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import pandas as pd import pandas as pd
import warnings
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Advertisers from .core.scrapers.models import Property, ListingType, Advertisers
from .exceptions import InvalidListingType, InvalidDate from .exceptions import InvalidListingType, InvalidDate
@@ -182,6 +183,36 @@ def validate_limit(limit: int) -> None:
raise ValueError("Property limit must be between 1 and 10,000.") raise ValueError("Property limit must be between 1 and 10,000.")
def validate_offset(offset: int, limit: int = 10000) -> None:
    """Validate offset parameter for pagination.

    Args:
        offset: Starting position for results pagination
        limit: Maximum number of results to fetch

    Raises:
        ValueError: If offset is invalid or if offset + limit exceeds API limit
    """
    # A missing offset means "no pagination" - nothing to validate.
    if offset is None:
        return

    if offset < 0:
        raise ValueError("Offset must be non-negative (>= 0).")

    # The API hard-caps results at position 10,000; offset + limit must fit.
    if limit is not None and offset + limit > 10000:
        raise ValueError(
            f"offset ({offset}) + limit ({limit}) = {offset + limit} exceeds API maximum of 10,000. "
            f"The API cannot return results beyond position 10,000. "
            f"To fetch more results, narrow your search."
        )

    # Offsets that are not page-aligned (page size = 200) still work, but
    # waste part of a fetched page - surface a warning rather than an error.
    if offset > 0 and offset % 200 != 0:
        warnings.warn(
            f"Offset should be a multiple of 200 (page size) for optimal performance. "
            f"Using offset {offset} may result in less efficient pagination.",
            UserWarning
        )
def validate_datetime(datetime_str: str | None) -> None: def validate_datetime(datetime_str: str | None) -> None:
"""Validate ISO 8601 datetime format.""" """Validate ISO 8601 datetime format."""
if not datetime_str: if not datetime_str:

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.7.1" version = "0.7.2"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -870,66 +870,111 @@ def test_combined_filters():
def test_sorting_by_price(): def test_sorting_by_price():
"""Test sorting by list_price - note API sorting may not be perfect""" """Test sorting by list_price with actual sort order validation"""
# Sort ascending (cheapest first) # Sort ascending (cheapest first) with multi-page limit to test concatenation
result_asc = scrape_property( result_asc = scrape_property(
location="Orlando, FL", location="Orlando, FL",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_price", sort_by="list_price",
sort_direction="asc", sort_direction="asc",
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result_asc is not None and len(result_asc) > 0 assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order (allow for None/NA values at the end)
prices_asc = result_asc["list_price"].dropna().tolist()
assert len(prices_asc) > 0, "No properties with prices found"
assert prices_asc == sorted(prices_asc), f"Prices not in ascending order: {prices_asc[:10]}"
# Sort descending (most expensive first) # Sort descending (most expensive first)
result_desc = scrape_property( result_desc = scrape_property(
location="San Antonio, TX", location="San Antonio, TX",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_price", sort_by="list_price",
sort_direction="desc", sort_direction="desc",
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result_desc is not None and len(result_desc) > 0 assert result_desc is not None and len(result_desc) > 0
# Note: Realtor API sorting may not be perfectly reliable for all search types # Verify descending sort order (allow for None/NA values at the end)
# The test ensures the sort parameters don't cause errors, actual sort order may vary prices_desc = result_desc["list_price"].dropna().tolist()
assert len(prices_desc) > 0, "No properties with prices found"
assert prices_desc == sorted(prices_desc, reverse=True), f"Prices not in descending order: {prices_desc[:10]}"
def test_sorting_by_date(): def test_sorting_by_date():
"""Test sorting by list_date - note API sorting may not be perfect""" """Test sorting by list_date with actual sort order validation"""
result = scrape_property( # Test descending (newest first) with multi-page limit
result_desc = scrape_property(
location="Columbus, OH", location="Columbus, OH",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_date", sort_by="list_date",
sort_direction="desc", # Newest first sort_direction="desc", # Newest first
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result is not None and len(result) > 0 assert result_desc is not None and len(result_desc) > 0
# Test ensures sort parameter doesn't cause errors # Verify descending sort order (allow for None/NA values at the end)
# Note: Realtor API sorting may not be perfectly reliable for all search types dates_desc = result_desc["list_date"].dropna().tolist()
assert len(dates_desc) > 0, "No properties with dates found"
assert dates_desc == sorted(dates_desc, reverse=True), f"Dates not in descending order (newest first): {dates_desc[:10]}"
# Test ascending (oldest first)
result_asc = scrape_property(
location="Columbus, OH",
listing_type="for_sale",
sort_by="list_date",
sort_direction="asc", # Oldest first
limit=250
)
assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order
dates_asc = result_asc["list_date"].dropna().tolist()
assert len(dates_asc) > 0, "No properties with dates found"
assert dates_asc == sorted(dates_asc), f"Dates not in ascending order (oldest first): {dates_asc[:10]}"
def test_sorting_by_sqft(): def test_sorting_by_sqft():
"""Test sorting by square footage - note API sorting may not be perfect""" """Test sorting by square footage with actual sort order validation"""
result = scrape_property( # Test descending (largest first) with multi-page limit
result_desc = scrape_property(
location="Indianapolis, IN", location="Indianapolis, IN",
listing_type="for_sale", listing_type="for_sale",
sort_by="sqft", sort_by="sqft",
sort_direction="desc", # Largest first sort_direction="desc", # Largest first
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result is not None and len(result) > 0 assert result_desc is not None and len(result_desc) > 0
# Test ensures sort parameter doesn't cause errors # Verify descending sort order (allow for None/NA values at the end)
# Note: Realtor API sorting may not be perfectly reliable for all search types sqfts_desc = result_desc["sqft"].dropna().tolist()
assert len(sqfts_desc) > 0, "No properties with sqft found"
assert sqfts_desc == sorted(sqfts_desc, reverse=True), f"Square footages not in descending order: {sqfts_desc[:10]}"
# Test ascending (smallest first)
result_asc = scrape_property(
location="Indianapolis, IN",
listing_type="for_sale",
sort_by="sqft",
sort_direction="asc", # Smallest first
limit=250
)
assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order
sqfts_asc = result_asc["sqft"].dropna().tolist()
assert len(sqfts_asc) > 0, "No properties with sqft found"
assert sqfts_asc == sorted(sqfts_asc), f"Square footages not in ascending order: {sqfts_asc[:10]}"
def test_filter_validation_errors(): def test_filter_validation_errors():