Compare commits

...

2 Commits

Author SHA1 Message Date
Zachary Hampton
4e6e144617 Fix exclude_pending and mls_only filters not working with raw return type
When return_type="raw" was specified, the exclude_pending and mls_only
parameters were ignored because these filters only existed in
process_property(), which is bypassed for raw data returns.

Changes:
- Added _apply_raw_data_filters() method to handle client-side filtering
  for raw data
- Applied the filter in search() method after sorting but before returning
- Fixed exclude_pending to check flags.is_pending and flags.is_contingent
- Fixed mls_only to check source.id (not mls.id which doesn't exist in raw data)
- Added comprehensive tests for both filters with raw data

Fixes #140

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-10 11:21:28 -08:00
Zachary Hampton
21b6ba44f4 Add pagination offset support for API queries
Implements offset parameter to enable pagination within the 10k API limit. Users can now fetch results in chunks (e.g., offset=200, limit=200 for results 200-399). Includes validation to ensure offset + limit doesn't exceed API maximum. Also fixes multi-page result sorting to preserve correct order across page boundaries.

Fixes #139

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 10:57:01 -08:00
7 changed files with 322 additions and 34 deletions

View File

@@ -278,7 +278,9 @@ Optional
├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending' ├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending'
└── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
``` ```
### Property Schema ### Property Schema

View File

@@ -1,7 +1,7 @@
import warnings import warnings
import pandas as pd import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_datetime, validate_filters, validate_sort from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_offset, validate_datetime, validate_filters, validate_sort
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
from typing import Union, Optional, List from typing import Union, Optional, List
@@ -21,6 +21,7 @@ def scrape_property(
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000, limit: int = 10000,
offset: int = 0,
# New date/time filtering parameters # New date/time filtering parameters
past_hours: int = None, past_hours: int = None,
datetime_from: str = None, datetime_from: str = None,
@@ -61,6 +62,7 @@ def scrape_property(
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
:param limit: Limit the number of results returned. Maximum is 10,000. :param limit: Limit the number of results returned. Maximum is 10,000.
:param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data.
New parameters: New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering) :param past_hours: Get properties in the last _ hours (requires client-side filtering)
@@ -77,6 +79,7 @@ def scrape_property(
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
validate_limit(limit) validate_limit(limit)
validate_offset(offset, limit)
validate_datetime(datetime_from) validate_datetime(datetime_from)
validate_datetime(datetime_to) validate_datetime(datetime_to)
validate_filters( validate_filters(
@@ -100,6 +103,7 @@ def scrape_property(
extra_property_data=extra_property_data, extra_property_data=extra_property_data,
exclude_pending=exclude_pending, exclude_pending=exclude_pending,
limit=limit, limit=limit,
offset=offset,
# New date/time filtering # New date/time filtering
past_hours=past_hours, past_hours=past_hours,
datetime_from=datetime_from, datetime_from=datetime_from,

View File

@@ -25,6 +25,7 @@ class ScraperInput(BaseModel):
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000 limit: int = 10000
offset: int = 0
return_type: ReturnType = ReturnType.pandas return_type: ReturnType = ReturnType.pandas
# New date/time filtering parameters # New date/time filtering parameters
@@ -106,6 +107,7 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.offset = scraper_input.offset
self.return_type = scraper_input.return_type self.return_type = scraper_input.return_type
# New date/time filtering # New date/time filtering

View File

@@ -405,13 +405,23 @@ class RealtorScraper(Scraper):
if self.return_type != ReturnType.raw: if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(process_property, result, self.mls_only, self.extra_property_data, # Store futures with their indices to maintain sort order
self.exclude_pending, self.listing_type, get_key, process_extra_property_details) for result in properties_list] futures_with_indices = [
(i, executor.submit(process_property, result, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key, process_extra_property_details))
for i, result in enumerate(properties_list)
]
for future in as_completed(futures): # Collect results and sort by index to preserve API sort order
results = []
for idx, future in futures_with_indices:
result = future.result() result = future.result()
if result: if result:
properties.append(result) results.append((idx, result))
# Sort by index and extract properties in correct order
results.sort(key=lambda x: x[0])
properties = [result for idx, result in results]
else: else:
properties = properties_list properties = properties_list
@@ -428,7 +438,7 @@ class RealtorScraper(Scraper):
location_type = location_info["area_type"] location_type = location_info["area_type"]
search_variables = { search_variables = {
"offset": 0, "offset": self.offset,
} }
search_type = ( search_type = (
@@ -473,21 +483,30 @@ class RealtorScraper(Scraper):
homes = result["properties"] homes = result["properties"]
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = [ # Store futures with their offsets to maintain proper sort order
executor.submit( # Start from offset + page_size and go up to offset + limit
futures_with_offsets = [
(i, executor.submit(
self.general_search, self.general_search,
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) ))
for i in range( for i in range(
self.DEFAULT_PAGE_SIZE, self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.limit), min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE, self.DEFAULT_PAGE_SIZE,
) )
] ]
for future in as_completed(futures): # Collect results and sort by offset to preserve API sort order across pages
homes.extend(future.result()["properties"]) results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
# Apply client-side hour-based filtering if needed # Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision) # (API only supports day-level filtering, so we post-filter for hour precision)
@@ -498,6 +517,16 @@ class RealtorScraper(Scraper):
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from): elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
homes = self._apply_pending_date_filter(homes) homes = self._apply_pending_date_filter(homes)
# Apply client-side sort to ensure results are properly ordered
# This is necessary after filtering and to guarantee sort order across page boundaries
if self.sort_by:
homes = self._apply_sort(homes)
# Apply raw data filters (exclude_pending and mls_only) for raw return type
# These filters are normally applied in process_property() but are bypassed for raw data
if self.return_type == ReturnType.raw:
homes = self._apply_raw_data_filters(homes)
return homes return homes
def _apply_hour_based_date_filter(self, homes): def _apply_hour_based_date_filter(self, homes):
@@ -722,6 +751,101 @@ class RealtorScraper(Scraper):
return date_range['from_date'] <= date_obj <= date_range['to_date'] return date_range['from_date'] <= date_obj <= date_range['to_date']
return False return False
def _apply_sort(self, homes):
    """Sort ``homes`` client-side by ``self.sort_by`` in ``self.sort_direction`` order.

    This is necessary because:
    1. Multi-page results need to be re-sorted after concatenation
    2. Filtering operations may disrupt the original sort order

    Homes whose sort field is missing (or, for the date fields, is a string
    that cannot be parsed as ISO 8601) are always placed at the end of the
    result, for both ascending and descending order, and keep their relative
    input order.

    Args:
        homes: List of properties (either dicts or Property objects)

    Returns:
        A new sorted list, or ``homes`` itself when it is empty or no sort
        field is configured.
    """
    if not homes or not self.sort_by:
        return homes

    from datetime import datetime, timezone

    sort_by = self.sort_by
    is_date_field = sort_by in ('list_date', 'sold_date', 'pending_date')

    def get_sort_value(home):
        """Return the comparable sort value for a home, or None when unusable."""
        if isinstance(home, dict):
            value = home.get(sort_by)
        else:
            # Property object
            value = getattr(home, sort_by, None)

        if value is None or not is_date_field:
            return value

        if isinstance(value, str):
            # fromisoformat() on older Pythons rejects a trailing 'Z'
            text = value[:-1] + '+00:00' if value.endswith('Z') else value
            try:
                value = datetime.fromisoformat(text)
            except ValueError:
                # Unparseable dates are treated like missing values
                return None

        # Naive and aware datetimes cannot be compared with each other.
        # NOTE(review): naive values are treated as UTC for ordering only -
        # confirm this matches the timestamps the API returns.
        if isinstance(value, datetime) and value.tzinfo is None:
            value = value.replace(tzinfo=timezone.utc)
        return value

    # Keep homes without a usable value out of the sort entirely. Encoding
    # "missing" inside the sort key breaks under reverse=True, which would
    # move those homes to the front for descending order.
    present = []
    missing = []
    for home in homes:
        sort_value = get_sort_value(home)
        if sort_value is None:
            missing.append(home)
        else:
            present.append((sort_value, home))

    # list.sort() is stable even with reverse=True, so ties keep API order.
    present.sort(key=lambda pair: pair[0], reverse=(self.sort_direction == "desc"))
    return [home for _, home in present] + missing
def _apply_raw_data_filters(self, homes):
    """Apply exclude_pending and mls_only filters for raw data returns.

    These filters are normally applied in process_property(), but that function
    is bypassed when return_type="raw", so we need to apply them here instead.

    A home is dropped when:
    - ``self.exclude_pending`` is set, the listing type is not PENDING, and
      the home's ``flags`` mark it as pending or contingent; or
    - ``self.mls_only`` is set and the home has no non-empty ``source.id``.

    A ``flags`` or ``source`` entry that is absent or null is treated as an
    empty mapping.

    Args:
        homes: List of properties (either dicts or Property objects)

    Returns:
        ``homes`` unchanged when it is empty or does not hold dicts,
        otherwise a new list with the filtered homes in their input order.
    """
    # Only filter raw data (dict objects)
    # Property objects have already been filtered in process_property()
    if not homes or not isinstance(homes[0], dict):
        return homes

    # Both switches are loop-invariant, so evaluate them once.
    drop_pending = bool(self.exclude_pending) and self.listing_type != ListingType.PENDING
    require_mls = bool(self.mls_only)

    def keep(home):
        """Return True when the home passes every active filter."""
        if drop_pending:
            # `or {}` also covers a key that is present with a null value
            flags = home.get('flags') or {}
            if flags.get('is_pending') or flags.get('is_contingent'):
                return False
        if require_mls:
            source = home.get('source') or {}
            if not source.get('id'):
                return False
        return True

    return [home for home in homes if keep(home)]
@retry( @retry(

View File

@@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import pandas as pd import pandas as pd
import warnings
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Advertisers from .core.scrapers.models import Property, ListingType, Advertisers
from .exceptions import InvalidListingType, InvalidDate from .exceptions import InvalidListingType, InvalidDate
@@ -182,6 +183,36 @@ def validate_limit(limit: int) -> None:
raise ValueError("Property limit must be between 1 and 10,000.") raise ValueError("Property limit must be between 1 and 10,000.")
def validate_offset(offset: int, limit: int = 10000) -> None:
    """Check that a pagination offset is usable together with ``limit``.

    Args:
        offset: Starting position for results pagination. ``None`` skips
            every check.
        limit: Maximum number of results to fetch. ``None`` skips the
            combined ``offset + limit`` check.

    Raises:
        ValueError: If ``offset`` is negative, or if ``offset + limit`` is
            greater than 10,000.

    Warns:
        UserWarning: If ``offset`` is positive and not a multiple of 200.
    """
    if offset is None:
        return

    if offset < 0:
        raise ValueError("Offset must be non-negative (>= 0).")

    # Reject windows that reach past the API's hard limit of 10,000
    if limit is not None:
        end_position = offset + limit
        if end_position > 10000:
            raise ValueError(
                f"offset ({offset}) + limit ({limit}) = {end_position} exceeds API maximum of 10,000. "
                "The API cannot return results beyond position 10,000. "
                "To fetch more results, narrow your search."
            )

    # Offsets that are not aligned to the 200-result page size only get a warning
    if offset > 0 and offset % 200:
        warnings.warn(
            "Offset should be a multiple of 200 (page size) for optimal performance. "
            f"Using offset {offset} may result in less efficient pagination.",
            UserWarning
        )
def validate_datetime(datetime_str: str | None) -> None: def validate_datetime(datetime_str: str | None) -> None:
"""Validate ISO 8601 datetime format.""" """Validate ISO 8601 datetime format."""
if not datetime_str: if not datetime_str:

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.7.1" version = "0.7.3"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -870,66 +870,111 @@ def test_combined_filters():
def test_sorting_by_price(): def test_sorting_by_price():
"""Test sorting by list_price - note API sorting may not be perfect""" """Test sorting by list_price with actual sort order validation"""
# Sort ascending (cheapest first) # Sort ascending (cheapest first) with multi-page limit to test concatenation
result_asc = scrape_property( result_asc = scrape_property(
location="Orlando, FL", location="Orlando, FL",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_price", sort_by="list_price",
sort_direction="asc", sort_direction="asc",
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result_asc is not None and len(result_asc) > 0 assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order (allow for None/NA values at the end)
prices_asc = result_asc["list_price"].dropna().tolist()
assert len(prices_asc) > 0, "No properties with prices found"
assert prices_asc == sorted(prices_asc), f"Prices not in ascending order: {prices_asc[:10]}"
# Sort descending (most expensive first) # Sort descending (most expensive first)
result_desc = scrape_property( result_desc = scrape_property(
location="San Antonio, TX", location="San Antonio, TX",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_price", sort_by="list_price",
sort_direction="desc", sort_direction="desc",
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result_desc is not None and len(result_desc) > 0 assert result_desc is not None and len(result_desc) > 0
# Note: Realtor API sorting may not be perfectly reliable for all search types # Verify descending sort order (allow for None/NA values at the end)
# The test ensures the sort parameters don't cause errors, actual sort order may vary prices_desc = result_desc["list_price"].dropna().tolist()
assert len(prices_desc) > 0, "No properties with prices found"
assert prices_desc == sorted(prices_desc, reverse=True), f"Prices not in descending order: {prices_desc[:10]}"
def test_sorting_by_date(): def test_sorting_by_date():
"""Test sorting by list_date - note API sorting may not be perfect""" """Test sorting by list_date with actual sort order validation"""
result = scrape_property( # Test descending (newest first) with multi-page limit
result_desc = scrape_property(
location="Columbus, OH", location="Columbus, OH",
listing_type="for_sale", listing_type="for_sale",
sort_by="list_date", sort_by="list_date",
sort_direction="desc", # Newest first sort_direction="desc", # Newest first
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result is not None and len(result) > 0 assert result_desc is not None and len(result_desc) > 0
# Test ensures sort parameter doesn't cause errors # Verify descending sort order (allow for None/NA values at the end)
# Note: Realtor API sorting may not be perfectly reliable for all search types dates_desc = result_desc["list_date"].dropna().tolist()
assert len(dates_desc) > 0, "No properties with dates found"
assert dates_desc == sorted(dates_desc, reverse=True), f"Dates not in descending order (newest first): {dates_desc[:10]}"
# Test ascending (oldest first)
result_asc = scrape_property(
location="Columbus, OH",
listing_type="for_sale",
sort_by="list_date",
sort_direction="asc", # Oldest first
limit=250
)
assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order
dates_asc = result_asc["list_date"].dropna().tolist()
assert len(dates_asc) > 0, "No properties with dates found"
assert dates_asc == sorted(dates_asc), f"Dates not in ascending order (oldest first): {dates_asc[:10]}"
def test_sorting_by_sqft(): def test_sorting_by_sqft():
"""Test sorting by square footage - note API sorting may not be perfect""" """Test sorting by square footage with actual sort order validation"""
result = scrape_property( # Test descending (largest first) with multi-page limit
result_desc = scrape_property(
location="Indianapolis, IN", location="Indianapolis, IN",
listing_type="for_sale", listing_type="for_sale",
sort_by="sqft", sort_by="sqft",
sort_direction="desc", # Largest first sort_direction="desc", # Largest first
limit=20 limit=250 # Multi-page to test concatenation logic
) )
assert result is not None and len(result) > 0 assert result_desc is not None and len(result_desc) > 0
# Test ensures sort parameter doesn't cause errors # Verify descending sort order (allow for None/NA values at the end)
# Note: Realtor API sorting may not be perfectly reliable for all search types sqfts_desc = result_desc["sqft"].dropna().tolist()
assert len(sqfts_desc) > 0, "No properties with sqft found"
assert sqfts_desc == sorted(sqfts_desc, reverse=True), f"Square footages not in descending order: {sqfts_desc[:10]}"
# Test ascending (smallest first)
result_asc = scrape_property(
location="Indianapolis, IN",
listing_type="for_sale",
sort_by="sqft",
sort_direction="asc", # Smallest first
limit=250
)
assert result_asc is not None and len(result_asc) > 0
# Verify ascending sort order
sqfts_asc = result_asc["sqft"].dropna().tolist()
assert len(sqfts_asc) > 0, "No properties with sqft found"
assert sqfts_asc == sorted(sqfts_asc), f"Square footages not in ascending order: {sqfts_asc[:10]}"
def test_filter_validation_errors(): def test_filter_validation_errors():
@@ -1225,3 +1270,83 @@ def test_last_status_change_date_hour_filtering():
f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}" f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}"
except (ValueError, TypeError): except (ValueError, TypeError):
pass # Skip if parsing fails pass # Skip if parsing fails
def test_exclude_pending_with_raw_data():
    """exclude_pending must still drop pending/contingent listings when return_type='raw'."""
    # For-sale search with the filter enabled, returning raw dicts
    raw_listings = scrape_property(
        location="Phoenix, AZ",
        listing_type="for_sale",
        exclude_pending=True,
        return_type="raw",
        limit=50
    )

    assert raw_listings is not None and len(raw_listings) > 0

    # No returned listing may carry a pending or contingent flag
    for listing in raw_listings:
        listing_flags = listing.get('flags', {})
        pending = listing_flags.get('is_pending', False)
        contingent = listing_flags.get('is_contingent', False)

        assert not pending, f"Property {listing.get('property_id')} should not be pending when exclude_pending=True"
        assert not contingent, f"Property {listing.get('property_id')} should not be contingent when exclude_pending=True"
def test_mls_only_with_raw_data():
    """mls_only must still restrict results to MLS listings when return_type='raw'."""
    # For-sale search with the filter enabled, returning raw dicts
    raw_listings = scrape_property(
        location="Dallas, TX",
        listing_type="for_sale",
        mls_only=True,
        return_type="raw",
        limit=50
    )

    assert raw_listings is not None and len(raw_listings) > 0

    # In raw data the MLS ID is stored under source.id
    for listing in raw_listings:
        listing_source = listing.get('source', {})
        if listing_source:
            mls_id = listing_source.get('id')
        else:
            mls_id = None
        assert mls_id is not None and mls_id != "", \
            f"Property {listing.get('property_id')} should have an MLS ID (source.id) when mls_only=True, got: {mls_id}"
def test_combined_filters_with_raw_data():
    """exclude_pending and mls_only must both be honoured together when return_type='raw'."""
    # For-sale search with both filters enabled, returning raw dicts
    raw_listings = scrape_property(
        location="Austin, TX",
        listing_type="for_sale",
        exclude_pending=True,
        mls_only=True,
        return_type="raw",
        limit=30
    )

    assert raw_listings is not None and len(raw_listings) > 0

    for listing in raw_listings:
        # exclude_pending: neither status flag may be set
        listing_flags = listing.get('flags', {})
        pending = listing_flags.get('is_pending', False)
        contingent = listing_flags.get('is_contingent', False)
        assert not pending, f"Property {listing.get('property_id')} should not be pending"
        assert not contingent, f"Property {listing.get('property_id')} should not be contingent"

        # mls_only: a non-empty source.id must be present
        listing_source = listing.get('source', {})
        if listing_source:
            mls_id = listing_source.get('id')
        else:
            mls_id = None
        assert mls_id is not None and mls_id != "", \
            f"Property {listing.get('property_id')} should have an MLS ID (source.id)"