Add comprehensive filtering system with hour-based datetime and property filters

This major enhancement addresses user needs for more precise filtering and introduces
powerful new capabilities for property searches:

Key Features:
- Hour-based date filtering (past_hours, datetime_from/to with ISO 8601 support)
- Server-side property filters (beds, baths, sqft, price, lot_sqft, year_built)
- Sorting support (list_date, sold_date, list_price, sqft, beds, baths)
- Full timestamp preservation (YYYY-MM-DD HH:MM:SS instead of date-only)
- Comprehensive validation with helpful error messages

Technical Changes:
- Preserve full datetime precision in processors.py and parsers.py
- Implement client-side hour-based post-filtering for all listing types
- Add server-side GraphQL filters for property characteristics
- Generalize filtering to work across SOLD, PENDING, FOR_SALE, FOR_RENT
- Add 15 comprehensive tests covering all new features
- Maintain full backward compatibility with existing parameters

Fixes #113 (sorting support)

Version bump to 0.7.0 reflects significant new functionality while maintaining
backward compatibility.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-10-20 14:21:12 -07:00
parent c9b05ebd9d
commit 18815e4207
9 changed files with 1009 additions and 34 deletions

View File

@@ -1,7 +1,7 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_datetime, validate_filters, validate_sort
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
from typing import Union, Optional, List
@@ -15,15 +15,36 @@ def scrape_property(
mls_only: bool = False,
past_days: int = None,
proxy: str = None,
date_from: str = None, #: TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation
date_from: str = None,
date_to: str = None,
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
limit: int = 10000
limit: int = 10000,
# New date/time filtering parameters
past_hours: int = None,
datetime_from: str = None,
datetime_to: str = None,
# New property filtering parameters
beds_min: int = None,
beds_max: int = None,
baths_min: float = None,
baths_max: float = None,
sqft_min: int = None,
sqft_max: int = None,
price_min: int = None,
price_max: int = None,
lot_sqft_min: int = None,
lot_sqft_max: int = None,
year_built_min: int = None,
year_built_max: int = None,
# New sorting parameters
sort_by: str = None,
sort_direction: str = "desc",
) -> Union[pd.DataFrame, list[dict], list[Property]]:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
@@ -40,10 +61,29 @@ def scrape_property(
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
:param limit: Limit the number of results returned. Maximum is 10,000.
New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering)
:param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00")
:param beds_min, beds_max: Filter by number of bedrooms
:param baths_min, baths_max: Filter by number of bathrooms
:param sqft_min, sqft_max: Filter by square footage
:param price_min, price_max: Filter by listing price
:param lot_sqft_min, lot_sqft_max: Filter by lot size
:param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths)
:param sort_direction: Sort direction (asc, desc)
"""
validate_input(listing_type)
validate_dates(date_from, date_to)
validate_limit(limit)
validate_datetime(datetime_from)
validate_datetime(datetime_to)
validate_filters(
beds_min, beds_max, baths_min, baths_max, sqft_min, sqft_max,
price_min, price_max, lot_sqft_min, lot_sqft_max, year_built_min, year_built_max
)
validate_sort(sort_by, sort_direction)
scraper_input = ScraperInput(
location=location,
@@ -60,6 +100,26 @@ def scrape_property(
extra_property_data=extra_property_data,
exclude_pending=exclude_pending,
limit=limit,
# New date/time filtering
past_hours=past_hours,
datetime_from=datetime_from,
datetime_to=datetime_to,
# New property filtering
beds_min=beds_min,
beds_max=beds_max,
baths_min=baths_min,
baths_max=baths_max,
sqft_min=sqft_min,
sqft_max=sqft_max,
price_min=price_min,
price_max=price_max,
lot_sqft_min=lot_sqft_min,
lot_sqft_max=lot_sqft_max,
year_built_min=year_built_min,
year_built_max=year_built_max,
# New sorting
sort_by=sort_by,
sort_direction=sort_direction,
)
site = RealtorScraper(scraper_input)

View File

@@ -27,6 +27,29 @@ class ScraperInput(BaseModel):
limit: int = 10000
return_type: ReturnType = ReturnType.pandas
# New date/time filtering parameters
past_hours: int | None = None
datetime_from: str | None = None
datetime_to: str | None = None
# New property filtering parameters
beds_min: int | None = None
beds_max: int | None = None
baths_min: float | None = None
baths_max: float | None = None
sqft_min: int | None = None
sqft_max: int | None = None
price_min: int | None = None
price_max: int | None = None
lot_sqft_min: int | None = None
lot_sqft_max: int | None = None
year_built_min: int | None = None
year_built_max: int | None = None
# New sorting parameters
sort_by: str | None = None
sort_direction: str = "desc"
class Scraper:
session = None
@@ -85,6 +108,29 @@ class Scraper:
self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
# New date/time filtering
self.past_hours = scraper_input.past_hours
self.datetime_from = scraper_input.datetime_from
self.datetime_to = scraper_input.datetime_to
# New property filtering
self.beds_min = scraper_input.beds_min
self.beds_max = scraper_input.beds_max
self.baths_min = scraper_input.baths_min
self.baths_max = scraper_input.baths_max
self.sqft_min = scraper_input.sqft_min
self.sqft_max = scraper_input.sqft_max
self.price_min = scraper_input.price_min
self.price_max = scraper_input.price_max
self.lot_sqft_min = scraper_input.lot_sqft_min
self.lot_sqft_max = scraper_input.lot_sqft_max
self.year_built_min = scraper_input.year_built_min
self.year_built_max = scraper_input.year_built_max
# New sorting
self.sort_by = scraper_input.sort_by
self.sort_direction = scraper_input.sort_direction
def search(self) -> list[Union[Property | dict]]: ...
@staticmethod

View File

@@ -132,36 +132,138 @@ class RealtorScraper(Scraper):
"""
date_param = ""
# Determine date field based on listing type
if self.listing_type == ListingType.SOLD:
if self.date_from and self.date_to:
date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
elif self.listing_type == ListingType.PENDING:
# Skip server-side date filtering for PENDING as both pending_date and contract_date
date_field = "sold_date"
elif self.listing_type in [ListingType.FOR_SALE, ListingType.FOR_RENT]:
date_field = "list_date"
else: # PENDING
# Skip server-side date filtering for PENDING as both pending_date and contract_date
# filters are broken in the API. Client-side filtering will be applied later.
pass
else:
if self.date_from and self.date_to:
date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
date_field = None
# Build date parameter (expand to full days if hour-based filtering is used)
if date_field:
if self.datetime_from or self.datetime_to:
# Hour-based datetime filtering: extract date parts for API, client-side filter by hours
from datetime import datetime
min_date = None
max_date = None
if self.datetime_from:
try:
dt_from = datetime.fromisoformat(self.datetime_from.replace('Z', '+00:00'))
min_date = dt_from.strftime("%Y-%m-%d")
except (ValueError, AttributeError):
pass
if self.datetime_to:
try:
dt_to = datetime.fromisoformat(self.datetime_to.replace('Z', '+00:00'))
max_date = dt_to.strftime("%Y-%m-%d")
except (ValueError, AttributeError):
pass
if min_date and max_date:
date_param = f'{date_field}: {{ min: "{min_date}", max: "{max_date}" }}'
elif min_date:
date_param = f'{date_field}: {{ min: "{min_date}" }}'
elif max_date:
date_param = f'{date_field}: {{ max: "{max_date}" }}'
elif self.past_hours:
# Query API for past N days (minimum 1 day), client-side filter by hours
days = max(1, int(self.past_hours / 24) + 1) # Round up to cover the full period
date_param = f'{date_field}: {{ min: "$today-{days}D" }}'
elif self.date_from and self.date_to:
date_param = f'{date_field}: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'
date_param = f'{date_field}: {{ min: "$today-{self.last_x_days}D" }}'
property_type_param = ""
if self.property_type:
property_types = [pt.value for pt in self.property_type]
property_type_param = f"type: {json.dumps(property_types)}"
sort_param = (
"sort: [{ field: sold_date, direction: desc }]"
if self.listing_type == ListingType.SOLD
else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor
)
# Build property filter parameters
property_filters = []
if self.beds_min is not None or self.beds_max is not None:
beds_filter = "beds: {"
if self.beds_min is not None:
beds_filter += f" min: {self.beds_min}"
if self.beds_max is not None:
beds_filter += f" max: {self.beds_max}"
beds_filter += " }"
property_filters.append(beds_filter)
if self.baths_min is not None or self.baths_max is not None:
baths_filter = "baths: {"
if self.baths_min is not None:
baths_filter += f" min: {self.baths_min}"
if self.baths_max is not None:
baths_filter += f" max: {self.baths_max}"
baths_filter += " }"
property_filters.append(baths_filter)
if self.sqft_min is not None or self.sqft_max is not None:
sqft_filter = "sqft: {"
if self.sqft_min is not None:
sqft_filter += f" min: {self.sqft_min}"
if self.sqft_max is not None:
sqft_filter += f" max: {self.sqft_max}"
sqft_filter += " }"
property_filters.append(sqft_filter)
if self.price_min is not None or self.price_max is not None:
price_filter = "list_price: {"
if self.price_min is not None:
price_filter += f" min: {self.price_min}"
if self.price_max is not None:
price_filter += f" max: {self.price_max}"
price_filter += " }"
property_filters.append(price_filter)
if self.lot_sqft_min is not None or self.lot_sqft_max is not None:
lot_sqft_filter = "lot_sqft: {"
if self.lot_sqft_min is not None:
lot_sqft_filter += f" min: {self.lot_sqft_min}"
if self.lot_sqft_max is not None:
lot_sqft_filter += f" max: {self.lot_sqft_max}"
lot_sqft_filter += " }"
property_filters.append(lot_sqft_filter)
if self.year_built_min is not None or self.year_built_max is not None:
year_built_filter = "year_built: {"
if self.year_built_min is not None:
year_built_filter += f" min: {self.year_built_min}"
if self.year_built_max is not None:
year_built_filter += f" max: {self.year_built_max}"
year_built_filter += " }"
property_filters.append(year_built_filter)
property_filters_param = "\n".join(property_filters)
# Build sort parameter
if self.sort_by:
sort_param = f"sort: [{{ field: {self.sort_by}, direction: {self.sort_direction} }}]"
elif self.listing_type == ListingType.SOLD:
sort_param = "sort: [{ field: sold_date, direction: desc }]"
else:
sort_param = "" #: prioritize normal fractal sort from realtor
pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" if self.listing_type == ListingType.PENDING else ""
)
# Build bucket parameter (only use fractal sort if no custom sort is specified)
bucket_param = ""
if not self.sort_by:
bucket_param = 'bucket: { sort: "fractal_v1.1.3_fr" }'
listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type
is_foreclosure = ""
@@ -187,6 +289,7 @@ class RealtorScraper(Scraper):
%s
%s
%s
%s
}
%s
limit: 200
@@ -197,6 +300,7 @@ class RealtorScraper(Scraper):
listing_type.value.lower(),
date_param,
property_type_param,
property_filters_param,
pending_or_contingent_param,
sort_param,
GENERAL_RESULTS_QUERY,
@@ -220,8 +324,9 @@ class RealtorScraper(Scraper):
%s
%s
%s
%s
}
bucket: { sort: "fractal_v1.1.3_fr" }
%s
%s
limit: 200
offset: $offset
@@ -231,7 +336,9 @@ class RealtorScraper(Scraper):
listing_type.value.lower(),
date_param,
property_type_param,
property_filters_param,
pending_or_contingent_param,
bucket_param,
sort_param,
GENERAL_RESULTS_QUERY,
)
@@ -382,13 +489,111 @@ class RealtorScraper(Scraper):
for future in as_completed(futures):
homes.extend(future.result()["properties"])
# Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision)
if self.past_hours or self.datetime_from or self.datetime_to:
homes = self._apply_hour_based_date_filter(homes)
# Apply client-side date filtering for PENDING properties
# (server-side filters are broken in the API)
if self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
homes = self._apply_pending_date_filter(homes)
return homes
def _apply_hour_based_date_filter(self, homes):
"""Apply client-side hour-based date filtering for all listing types.
This is used when past_hours, datetime_from, or datetime_to are specified,
since the API only supports day-level filtering.
"""
if not homes:
return homes
from datetime import datetime, timedelta
# Determine date range with hour precision
date_range = None
if self.past_hours:
cutoff_datetime = datetime.now() - timedelta(hours=self.past_hours)
date_range = {'type': 'since', 'date': cutoff_datetime}
elif self.datetime_from or self.datetime_to:
try:
from_datetime = None
to_datetime = None
if self.datetime_from:
from_datetime_str = self.datetime_from.replace('Z', '+00:00') if self.datetime_from.endswith('Z') else self.datetime_from
from_datetime = datetime.fromisoformat(from_datetime_str).replace(tzinfo=None)
if self.datetime_to:
to_datetime_str = self.datetime_to.replace('Z', '+00:00') if self.datetime_to.endswith('Z') else self.datetime_to
to_datetime = datetime.fromisoformat(to_datetime_str).replace(tzinfo=None)
if from_datetime and to_datetime:
date_range = {'type': 'range', 'from_date': from_datetime, 'to_date': to_datetime}
elif from_datetime:
date_range = {'type': 'since', 'date': from_datetime}
elif to_datetime:
date_range = {'type': 'until', 'date': to_datetime}
except (ValueError, AttributeError):
return homes # If parsing fails, return unfiltered
if not date_range:
return homes
# Determine which date field to use based on listing type
date_field_name = self._get_date_field_for_listing_type()
filtered_homes = []
for home in homes:
# Extract the appropriate date for this property
property_date = self._extract_date_from_home(home, date_field_name)
# Handle properties without dates
if property_date is None:
# For PENDING, include contingent properties without pending_date
if self.listing_type == ListingType.PENDING and self._is_contingent(home):
filtered_homes.append(home)
continue
# Check if property date falls within the specified range
if self._is_datetime_in_range(property_date, date_range):
filtered_homes.append(home)
return filtered_homes
def _get_date_field_for_listing_type(self):
"""Get the appropriate date field name for the current listing type."""
if self.listing_type == ListingType.SOLD:
return 'last_sold_date'
elif self.listing_type == ListingType.PENDING:
return 'pending_date'
else: # FOR_SALE or FOR_RENT
return 'list_date'
def _extract_date_from_home(self, home, date_field_name):
"""Extract a date field from a home (handles both dict and Property object)."""
if isinstance(home, dict):
date_value = home.get(date_field_name)
else:
date_value = getattr(home, date_field_name, None)
if date_value:
return self._parse_date_value(date_value)
return None
def _is_datetime_in_range(self, date_obj, date_range):
"""Check if a datetime object falls within the specified date range (with hour precision)."""
if date_range['type'] == 'since':
return date_obj >= date_range['date']
elif date_range['type'] == 'until':
return date_obj <= date_range['date']
elif date_range['type'] == 'range':
return date_range['from_date'] <= date_obj <= date_range['to_date']
return False
def _apply_pending_date_filter(self, homes):
"""Apply client-side date filtering for PENDING properties based on pending_date field.
For contingent properties without pending_date, tries fallback date fields."""

View File

@@ -250,9 +250,28 @@ def parse_description(result: dict) -> Description | None:
def calculate_days_on_mls(result: dict) -> Optional[int]:
"""Calculate days on MLS from result data"""
list_date_str = result.get("list_date")
list_date = datetime.strptime(list_date_str.split("T")[0], "%Y-%m-%d") if list_date_str else None
list_date = None
if list_date_str:
try:
# Parse full datetime, then use date() for day calculation
list_date_str_clean = list_date_str.replace('Z', '+00:00') if list_date_str.endswith('Z') else list_date_str
list_date = datetime.fromisoformat(list_date_str_clean).replace(tzinfo=None)
except (ValueError, AttributeError):
# Fallback for date-only format
list_date = datetime.strptime(list_date_str.split("T")[0], "%Y-%m-%d") if "T" in list_date_str else None
last_sold_date_str = result.get("last_sold_date")
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None
last_sold_date = None
if last_sold_date_str:
try:
last_sold_date_str_clean = last_sold_date_str.replace('Z', '+00:00') if last_sold_date_str.endswith('Z') else last_sold_date_str
last_sold_date = datetime.fromisoformat(last_sold_date_str_clean).replace(tzinfo=None)
except (ValueError, AttributeError):
# Fallback for date-only format
try:
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d")
except ValueError:
last_sold_date = None
today = datetime.now()
if list_date:

View File

@@ -121,10 +121,10 @@ def process_property(result: dict, mls_only: bool = False, extra_property_data:
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=(datetime.fromisoformat(result["list_date"].split("T")[0]) if result.get("list_date") else None),
list_date=(datetime.fromisoformat(result["list_date"].replace('Z', '+00:00') if result["list_date"].endswith('Z') else result["list_date"]) if result.get("list_date") else None),
prc_sqft=result.get("price_per_sqft"),
last_sold_date=(datetime.fromisoformat(result["last_sold_date"]) if result.get("last_sold_date") else None),
pending_date=(datetime.fromisoformat(result["pending_date"].split("T")[0]) if result.get("pending_date") else None),
last_sold_date=(datetime.fromisoformat(result["last_sold_date"].replace('Z', '+00:00') if result["last_sold_date"].endswith('Z') else result["last_sold_date"]) if result.get("last_sold_date") else None),
pending_date=(datetime.fromisoformat(result["pending_date"].replace('Z', '+00:00') if result["pending_date"].endswith('Z') else result["pending_date"]) if result.get("pending_date") else None),
new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),

View File

@@ -119,10 +119,10 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
# Convert datetime objects to strings for CSV
# Convert datetime objects to strings for CSV (preserve full datetime including time)
for date_field in ["list_date", "pending_date", "last_sold_date"]:
if prop_data.get(date_field):
prop_data[date_field] = prop_data[date_field].strftime("%Y-%m-%d") if hasattr(prop_data[date_field], 'strftime') else prop_data[date_field]
prop_data[date_field] = prop_data[date_field].strftime("%Y-%m-%d %H:%M:%S") if hasattr(prop_data[date_field], 'strftime') else prop_data[date_field]
# Convert HttpUrl objects to strings for CSV
if prop_data.get("property_url"):
@@ -179,3 +179,65 @@ def validate_limit(limit: int) -> None:
if limit is not None and (limit < 1 or limit > 10000):
raise ValueError("Property limit must be between 1 and 10,000.")
def validate_datetime(datetime_str: str | None) -> None:
"""Validate ISO 8601 datetime format."""
if not datetime_str:
return
try:
# Try parsing as ISO 8601 datetime
datetime.fromisoformat(datetime_str.replace('Z', '+00:00'))
except (ValueError, AttributeError):
raise InvalidDate(
f"Invalid datetime format: '{datetime_str}'. "
f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')."
)
def validate_filters(
beds_min: int | None = None,
beds_max: int | None = None,
baths_min: float | None = None,
baths_max: float | None = None,
sqft_min: int | None = None,
sqft_max: int | None = None,
price_min: int | None = None,
price_max: int | None = None,
lot_sqft_min: int | None = None,
lot_sqft_max: int | None = None,
year_built_min: int | None = None,
year_built_max: int | None = None,
) -> None:
"""Validate that min values are less than max values for range filters."""
ranges = [
("beds", beds_min, beds_max),
("baths", baths_min, baths_max),
("sqft", sqft_min, sqft_max),
("price", price_min, price_max),
("lot_sqft", lot_sqft_min, lot_sqft_max),
("year_built", year_built_min, year_built_max),
]
for name, min_val, max_val in ranges:
if min_val is not None and max_val is not None and min_val > max_val:
raise ValueError(f"{name}_min ({min_val}) cannot be greater than {name}_max ({max_val}).")
def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> None:
"""Validate sort parameters."""
valid_sort_fields = ["list_date", "sold_date", "list_price", "sqft", "beds", "baths"]
valid_directions = ["asc", "desc"]
if sort_by and sort_by not in valid_sort_fields:
raise ValueError(
f"Invalid sort_by value: '{sort_by}'. "
f"Valid options: {', '.join(valid_sort_fields)}"
)
if sort_direction and sort_direction not in valid_directions:
raise ValueError(
f"Invalid sort_direction value: '{sort_direction}'. "
f"Valid options: {', '.join(valid_directions)}"
)