Add comprehensive filtering system with hour-based datetime and property filters

This major enhancement addresses user needs for more precise filtering and introduces
powerful new capabilities for property searches:

Key Features:
- Hour-based date filtering (past_hours, datetime_from/to with ISO 8601 support)
- Server-side property filters (beds, baths, sqft, price, lot_sqft, year_built)
- Sorting support (list_date, sold_date, list_price, sqft, beds, baths)
- Full timestamp preservation (YYYY-MM-DD HH:MM:SS instead of date-only)
- Comprehensive validation with helpful error messages

Technical Changes:
- Preserve full datetime precision in processors.py and parsers.py
- Implement client-side hour-based post-filtering for all listing types
- Add server-side GraphQL filters for property characteristics
- Generalize filtering to work across SOLD, PENDING, FOR_SALE, FOR_RENT
- Add 15 comprehensive tests covering all new features
- Maintain full backward compatibility with existing parameters

Fixes #113 (sorting support)

Version bump to 0.7.0 reflects significant new functionality while maintaining
backward compatibility.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-10-20 14:21:12 -07:00
parent c9b05ebd9d
commit 18815e4207
9 changed files with 1009 additions and 34 deletions

View File

@@ -1,7 +1,7 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_datetime, validate_filters, validate_sort
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
from typing import Union, Optional, List
@@ -15,15 +15,36 @@ def scrape_property(
mls_only: bool = False,
past_days: int = None,
proxy: str = None,
date_from: str = None, #: TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation
date_from: str = None,
date_to: str = None,
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
limit: int = 10000
limit: int = 10000,
# New date/time filtering parameters
past_hours: int = None,
datetime_from: str = None,
datetime_to: str = None,
# New property filtering parameters
beds_min: int = None,
beds_max: int = None,
baths_min: float = None,
baths_max: float = None,
sqft_min: int = None,
sqft_max: int = None,
price_min: int = None,
price_max: int = None,
lot_sqft_min: int = None,
lot_sqft_max: int = None,
year_built_min: int = None,
year_built_max: int = None,
# New sorting parameters
sort_by: str = None,
sort_direction: str = "desc",
) -> Union[pd.DataFrame, list[dict], list[Property]]:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
@@ -40,10 +61,29 @@ def scrape_property(
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
:param limit: Limit the number of results returned. Maximum is 10,000.
New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering)
:param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00")
:param beds_min, beds_max: Filter by number of bedrooms
:param baths_min, baths_max: Filter by number of bathrooms
:param sqft_min, sqft_max: Filter by square footage
:param price_min, price_max: Filter by listing price
:param lot_sqft_min, lot_sqft_max: Filter by lot size
:param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths)
:param sort_direction: Sort direction (asc, desc)
"""
validate_input(listing_type)
validate_dates(date_from, date_to)
validate_limit(limit)
validate_datetime(datetime_from)
validate_datetime(datetime_to)
validate_filters(
beds_min, beds_max, baths_min, baths_max, sqft_min, sqft_max,
price_min, price_max, lot_sqft_min, lot_sqft_max, year_built_min, year_built_max
)
validate_sort(sort_by, sort_direction)
scraper_input = ScraperInput(
location=location,
@@ -60,6 +100,26 @@ def scrape_property(
extra_property_data=extra_property_data,
exclude_pending=exclude_pending,
limit=limit,
# New date/time filtering
past_hours=past_hours,
datetime_from=datetime_from,
datetime_to=datetime_to,
# New property filtering
beds_min=beds_min,
beds_max=beds_max,
baths_min=baths_min,
baths_max=baths_max,
sqft_min=sqft_min,
sqft_max=sqft_max,
price_min=price_min,
price_max=price_max,
lot_sqft_min=lot_sqft_min,
lot_sqft_max=lot_sqft_max,
year_built_min=year_built_min,
year_built_max=year_built_max,
# New sorting
sort_by=sort_by,
sort_direction=sort_direction,
)
site = RealtorScraper(scraper_input)