Add last_update_date filtering and improve time interface DX

Part A: Add last_update_date filtering (client-side)
- Add updated_since parameter (accepts datetime object or ISO string)
- Add updated_in_past_hours parameter (accepts int or timedelta)
- Implement _apply_last_update_date_filter() method for client-side filtering
- Add mutual exclusion validation for updated_* parameters

Part B: Improve time interface DX
- Accept datetime/timedelta objects for datetime_from, datetime_to
- Accept timedelta objects for past_hours, past_days
- Add type conversion helper functions in utils.py
- Improve validation error messages with specific examples
- Update validate_datetime to accept datetime objects

Helper functions added:
- convert_to_datetime_string() - Converts datetime objects to ISO strings
- extract_timedelta_hours() - Extracts hours from timedelta objects
- extract_timedelta_days() - Extracts days from timedelta objects
- validate_last_update_filters() - Validates last_update_date parameters

All changes are backward compatible - existing string/int parameters still work.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-11-11 12:00:15 -08:00
parent 3a0e91b876
commit a6fe0d2675
4 changed files with 237 additions and 21 deletions

View File

@@ -1,7 +1,12 @@
import warnings import warnings
import pandas as pd import pandas as pd
from datetime import datetime, timedelta
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_offset, validate_datetime, validate_filters, validate_sort from .utils import (
process_result, ordered_properties, validate_input, validate_dates, validate_limit,
validate_offset, validate_datetime, validate_filters, validate_sort, validate_last_update_filters,
convert_to_datetime_string, extract_timedelta_hours, extract_timedelta_days
)
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
from typing import Union, Optional, List from typing import Union, Optional, List
@@ -13,7 +18,7 @@ def scrape_property(
property_type: Optional[List[str]] = None, property_type: Optional[List[str]] = None,
radius: float = None, radius: float = None,
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int | timedelta = None,
proxy: str = None, proxy: str = None,
date_from: str = None, date_from: str = None,
date_to: str = None, date_to: str = None,
@@ -23,9 +28,12 @@ def scrape_property(
limit: int = 10000, limit: int = 10000,
offset: int = 0, offset: int = 0,
# New date/time filtering parameters # New date/time filtering parameters
past_hours: int = None, past_hours: int | timedelta = None,
datetime_from: str = None, datetime_from: datetime | str = None,
datetime_to: str = None, datetime_to: datetime | str = None,
# New last_update_date filtering parameters
updated_since: datetime | str = None,
updated_in_past_hours: int | timedelta = None,
# New property filtering parameters # New property filtering parameters
beds_min: int = None, beds_min: int = None,
beds_max: int = None, beds_max: int = None,
@@ -67,8 +75,10 @@ def scrape_property(
:param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data. :param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data.
New parameters: New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering) :param past_hours: Get properties in the last _ hours (requires client-side filtering). Accepts int or timedelta.
:param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00") :param datetime_from, datetime_to: Precise time filtering. Accepts datetime objects or ISO 8601 strings (e.g. "2025-01-20T14:30:00")
:param updated_since: Filter by last_update_date (when property was last updated). Accepts datetime object or ISO 8601 string (client-side filtering)
:param updated_in_past_hours: Filter by properties updated in the last _ hours. Accepts int or timedelta (client-side filtering)
:param beds_min, beds_max: Filter by number of bedrooms :param beds_min, beds_max: Filter by number of bedrooms
:param baths_min, baths_max: Filter by number of bathrooms :param baths_min, baths_max: Filter by number of bathrooms
:param sqft_min, sqft_max: Filter by square footage :param sqft_min, sqft_max: Filter by square footage
@@ -77,6 +87,8 @@ def scrape_property(
:param year_built_min, year_built_max: Filter by year built :param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
:param sort_direction: Sort direction (asc, desc) :param sort_direction: Sort direction (asc, desc)
Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
@@ -90,6 +102,12 @@ def scrape_property(
) )
validate_sort(sort_by, sort_direction) validate_sort(sort_by, sort_direction)
# Validate new last_update_date filtering parameters
validate_last_update_filters(
convert_to_datetime_string(updated_since),
extract_timedelta_hours(updated_in_past_hours)
)
# Convert listing_type to appropriate format # Convert listing_type to appropriate format
if listing_type is None: if listing_type is None:
converted_listing_type = None converted_listing_type = None
@@ -98,6 +116,14 @@ def scrape_property(
else: else:
converted_listing_type = ListingType(listing_type.upper()) converted_listing_type = ListingType(listing_type.upper())
# Convert datetime/timedelta objects to appropriate formats
converted_past_days = extract_timedelta_days(past_days)
converted_past_hours = extract_timedelta_hours(past_hours)
converted_datetime_from = convert_to_datetime_string(datetime_from)
converted_datetime_to = convert_to_datetime_string(datetime_to)
converted_updated_since = convert_to_datetime_string(updated_since)
converted_updated_in_past_hours = extract_timedelta_hours(updated_in_past_hours)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=converted_listing_type, listing_type=converted_listing_type,
@@ -106,7 +132,7 @@ def scrape_property(
proxy=proxy, proxy=proxy,
radius=radius, radius=radius,
mls_only=mls_only, mls_only=mls_only,
last_x_days=past_days, last_x_days=converted_past_days,
date_from=date_from, date_from=date_from,
date_to=date_to, date_to=date_to,
foreclosure=foreclosure, foreclosure=foreclosure,
@@ -115,9 +141,12 @@ def scrape_property(
limit=limit, limit=limit,
offset=offset, offset=offset,
# New date/time filtering # New date/time filtering
past_hours=past_hours, past_hours=converted_past_hours,
datetime_from=datetime_from, datetime_from=converted_datetime_from,
datetime_to=datetime_to, datetime_to=converted_datetime_to,
# New last_update_date filtering
updated_since=converted_updated_since,
updated_in_past_hours=converted_updated_in_past_hours,
# New property filtering # New property filtering
beds_min=beds_min, beds_min=beds_min,
beds_max=beds_max, beds_max=beds_max,

View File

@@ -33,6 +33,10 @@ class ScraperInput(BaseModel):
datetime_from: str | None = None datetime_from: str | None = None
datetime_to: str | None = None datetime_to: str | None = None
# New last_update_date filtering parameters
updated_since: str | None = None
updated_in_past_hours: int | None = None
# New property filtering parameters # New property filtering parameters
beds_min: int | None = None beds_min: int | None = None
beds_max: int | None = None beds_max: int | None = None
@@ -115,6 +119,10 @@ class Scraper:
self.datetime_from = scraper_input.datetime_from self.datetime_from = scraper_input.datetime_from
self.datetime_to = scraper_input.datetime_to self.datetime_to = scraper_input.datetime_to
# New last_update_date filtering
self.updated_since = scraper_input.updated_since
self.updated_in_past_hours = scraper_input.updated_in_past_hours
# New property filtering # New property filtering
self.beds_min = scraper_input.beds_min self.beds_min = scraper_input.beds_min
self.beds_max = scraper_input.beds_max self.beds_max = scraper_input.beds_max

View File

@@ -558,6 +558,10 @@ class RealtorScraper(Scraper):
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from): elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
homes = self._apply_pending_date_filter(homes) homes = self._apply_pending_date_filter(homes)
# Apply client-side filtering by last_update_date if specified
if self.updated_since or self.updated_in_past_hours:
homes = self._apply_last_update_date_filter(homes)
# Apply client-side sort to ensure results are properly ordered # Apply client-side sort to ensure results are properly ordered
# This is necessary after filtering and to guarantee sort order across page boundaries # This is necessary after filtering and to guarantee sort order across page boundaries
if self.sort_by: if self.sort_by:
@@ -729,7 +733,51 @@ class RealtorScraper(Scraper):
if hasattr(home, 'flags') and home.flags: if hasattr(home, 'flags') and home.flags:
return getattr(home.flags, 'is_contingent', False) return getattr(home.flags, 'is_contingent', False)
return False return False
def _apply_last_update_date_filter(self, homes):
"""Apply client-side filtering by last_update_date.
This is used when updated_since or updated_in_past_hours are specified.
Filters properties based on when they were last updated.
"""
if not homes:
return homes
from datetime import datetime, timedelta
# Determine date range for last_update_date filtering
date_range = None
if self.updated_in_past_hours:
cutoff_datetime = datetime.now() - timedelta(hours=self.updated_in_past_hours)
date_range = {'type': 'since', 'date': cutoff_datetime}
elif self.updated_since:
try:
since_datetime_str = self.updated_since.replace('Z', '+00:00') if self.updated_since.endswith('Z') else self.updated_since
since_datetime = datetime.fromisoformat(since_datetime_str).replace(tzinfo=None)
date_range = {'type': 'since', 'date': since_datetime}
except (ValueError, AttributeError):
return homes # If parsing fails, return unfiltered
if not date_range:
return homes
filtered_homes = []
for home in homes:
# Extract last_update_date from the property
property_date = self._extract_date_from_home(home, 'last_update_date')
# Skip properties without last_update_date
if property_date is None:
continue
# Check if property date falls within the specified range
if self._is_datetime_in_range(property_date, date_range):
filtered_homes.append(home)
return filtered_homes
def _get_date_range(self): def _get_date_range(self):
"""Get the date range for filtering based on instance parameters.""" """Get the date range for filtering based on instance parameters."""
from datetime import datetime, timedelta from datetime import datetime, timedelta

View File

@@ -172,7 +172,7 @@ def validate_input(listing_type: str | list[str] | None) -> None:
def validate_dates(date_from: str | None, date_to: str | None) -> None: def validate_dates(date_from: str | None, date_to: str | None) -> None:
if isinstance(date_from, str) != isinstance(date_to, str): if isinstance(date_from, str) != isinstance(date_to, str):
raise InvalidDate("Both date_from and date_to must be provided.") raise InvalidDate("Both date_from and date_to must be provided together.")
if date_from and date_to: if date_from and date_to:
try: try:
@@ -180,9 +180,16 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d") date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
if date_to_obj < date_from_obj: if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.") raise InvalidDate(f"date_to ('{date_to}') must be after date_from ('{date_from}').")
except ValueError: except ValueError as e:
raise InvalidDate(f"Invalid date format or range") # Provide specific guidance on the expected format
if "does not match format" in str(e):
raise InvalidDate(
f"Invalid date format. Expected 'YYYY-MM-DD' format. "
f"Examples: '2025-01-20', '2024-12-31'. "
f"Got: date_from='{date_from}', date_to='{date_to}'"
)
raise InvalidDate(f"Invalid date format or range: {e}")
def validate_limit(limit: int) -> None: def validate_limit(limit: int) -> None:
@@ -222,21 +229,53 @@ def validate_offset(offset: int, limit: int = 10000) -> None:
) )
def validate_datetime(datetime_str: str | None) -> None: def validate_datetime(datetime_value) -> None:
"""Validate ISO 8601 datetime format.""" """Validate datetime value (accepts datetime objects or ISO 8601 strings)."""
if not datetime_str: if datetime_value is None:
return return
# Already a datetime object - valid
from datetime import datetime as dt, date
if isinstance(datetime_value, (dt, date)):
return
# Must be a string - validate ISO 8601 format
if not isinstance(datetime_value, str):
raise InvalidDate(
f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. "
f"Got: {type(datetime_value).__name__}"
)
try: try:
# Try parsing as ISO 8601 datetime # Try parsing as ISO 8601 datetime
datetime.fromisoformat(datetime_str.replace('Z', '+00:00')) datetime.fromisoformat(datetime_value.replace('Z', '+00:00'))
except (ValueError, AttributeError): except (ValueError, AttributeError):
raise InvalidDate( raise InvalidDate(
f"Invalid datetime format: '{datetime_str}'. " f"Invalid datetime format: '{datetime_value}'. "
f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')." f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')."
) )
def validate_last_update_filters(updated_since: str | None, updated_in_past_hours: int | None) -> None:
"""Validate last_update_date filtering parameters."""
if updated_since and updated_in_past_hours:
raise ValueError(
"Cannot use both 'updated_since' and 'updated_in_past_hours' parameters together. "
"Please use only one method to filter by last_update_date."
)
# Validate updated_since format if provided
if updated_since:
validate_datetime(updated_since)
# Validate updated_in_past_hours range if provided
if updated_in_past_hours is not None:
if updated_in_past_hours < 1:
raise ValueError(
f"updated_in_past_hours must be at least 1. Got: {updated_in_past_hours}"
)
def validate_filters( def validate_filters(
beds_min: int | None = None, beds_min: int | None = None,
beds_max: int | None = None, beds_max: int | None = None,
@@ -282,3 +321,95 @@ def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> N
f"Invalid sort_direction value: '{sort_direction}'. " f"Invalid sort_direction value: '{sort_direction}'. "
f"Valid options: {', '.join(valid_directions)}" f"Valid options: {', '.join(valid_directions)}"
) )
def convert_to_datetime_string(value) -> str | None:
"""
Convert datetime object or string to ISO 8601 string format.
Accepts:
- datetime.datetime objects
- datetime.date objects
- ISO 8601 strings (returned as-is)
- None (returns None)
Returns ISO 8601 formatted string or None.
"""
if value is None:
return None
# Already a string - return as-is
if isinstance(value, str):
return value
# datetime.datetime object
from datetime import datetime, date
if isinstance(value, datetime):
return value.isoformat()
# datetime.date object (convert to datetime at midnight)
if isinstance(value, date):
return datetime.combine(value, datetime.min.time()).isoformat()
raise ValueError(
f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. "
f"Got: {type(value).__name__}"
)
def extract_timedelta_hours(value) -> int | None:
"""
Extract hours from int or timedelta object.
Accepts:
- int (returned as-is)
- timedelta objects (converted to total hours)
- None (returns None)
Returns integer hours or None.
"""
if value is None:
return None
# Already an int - return as-is
if isinstance(value, int):
return value
# timedelta object - convert to hours
from datetime import timedelta
if isinstance(value, timedelta):
return int(value.total_seconds() / 3600)
raise ValueError(
f"Invalid past_hours value. Expected int or timedelta object. "
f"Got: {type(value).__name__}"
)
def extract_timedelta_days(value) -> int | None:
"""
Extract days from int or timedelta object.
Accepts:
- int (returned as-is)
- timedelta objects (converted to total days)
- None (returns None)
Returns integer days or None.
"""
if value is None:
return None
# Already an int - return as-is
if isinstance(value, int):
return value
# timedelta object - convert to days
from datetime import timedelta
if isinstance(value, timedelta):
return int(value.total_seconds() / 86400) # 86400 seconds in a day
raise ValueError(
f"Invalid past_days value. Expected int or timedelta object. "
f"Got: {type(value).__name__}"
)