mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-04 19:44:29 -08:00
Add last_update_date filtering and improve time interface DX
Part A: Add last_update_date filtering (client-side) - Add updated_since parameter (accepts datetime object or ISO string) - Add updated_in_past_hours parameter (accepts int or timedelta) - Implement _apply_last_update_date_filter() method for client-side filtering - Add mutual exclusion validation for updated_* parameters Part B: Improve time interface DX - Accept datetime/timedelta objects for datetime_from, datetime_to - Accept timedelta objects for past_hours, past_days - Add type conversion helper functions in utils.py - Improve validation error messages with specific examples - Update validate_datetime to accept datetime objects Helper functions added: - convert_to_datetime_string() - Converts datetime objects to ISO strings - extract_timedelta_hours() - Extracts hours from timedelta objects - extract_timedelta_days() - Extracts days from timedelta objects - validate_last_update_filters() - Validates last_update_date parameters All changes are backward compatible - existing string/int parameters still work. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,12 @@
|
|||||||
import warnings
|
import warnings
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from datetime import datetime, timedelta
|
||||||
from .core.scrapers import ScraperInput
|
from .core.scrapers import ScraperInput
|
||||||
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_offset, validate_datetime, validate_filters, validate_sort
|
from .utils import (
|
||||||
|
process_result, ordered_properties, validate_input, validate_dates, validate_limit,
|
||||||
|
validate_offset, validate_datetime, validate_filters, validate_sort, validate_last_update_filters,
|
||||||
|
convert_to_datetime_string, extract_timedelta_hours, extract_timedelta_days
|
||||||
|
)
|
||||||
from .core.scrapers.realtor import RealtorScraper
|
from .core.scrapers.realtor import RealtorScraper
|
||||||
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
|
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
|
||||||
from typing import Union, Optional, List
|
from typing import Union, Optional, List
|
||||||
@@ -13,7 +18,7 @@ def scrape_property(
|
|||||||
property_type: Optional[List[str]] = None,
|
property_type: Optional[List[str]] = None,
|
||||||
radius: float = None,
|
radius: float = None,
|
||||||
mls_only: bool = False,
|
mls_only: bool = False,
|
||||||
past_days: int = None,
|
past_days: int | timedelta = None,
|
||||||
proxy: str = None,
|
proxy: str = None,
|
||||||
date_from: str = None,
|
date_from: str = None,
|
||||||
date_to: str = None,
|
date_to: str = None,
|
||||||
@@ -23,9 +28,12 @@ def scrape_property(
|
|||||||
limit: int = 10000,
|
limit: int = 10000,
|
||||||
offset: int = 0,
|
offset: int = 0,
|
||||||
# New date/time filtering parameters
|
# New date/time filtering parameters
|
||||||
past_hours: int = None,
|
past_hours: int | timedelta = None,
|
||||||
datetime_from: str = None,
|
datetime_from: datetime | str = None,
|
||||||
datetime_to: str = None,
|
datetime_to: datetime | str = None,
|
||||||
|
# New last_update_date filtering parameters
|
||||||
|
updated_since: datetime | str = None,
|
||||||
|
updated_in_past_hours: int | timedelta = None,
|
||||||
# New property filtering parameters
|
# New property filtering parameters
|
||||||
beds_min: int = None,
|
beds_min: int = None,
|
||||||
beds_max: int = None,
|
beds_max: int = None,
|
||||||
@@ -67,8 +75,10 @@ def scrape_property(
|
|||||||
:param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data.
|
:param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data.
|
||||||
|
|
||||||
New parameters:
|
New parameters:
|
||||||
:param past_hours: Get properties in the last _ hours (requires client-side filtering)
|
:param past_hours: Get properties in the last _ hours (requires client-side filtering). Accepts int or timedelta.
|
||||||
:param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00")
|
:param datetime_from, datetime_to: Precise time filtering. Accepts datetime objects or ISO 8601 strings (e.g. "2025-01-20T14:30:00")
|
||||||
|
:param updated_since: Filter by last_update_date (when property was last updated). Accepts datetime object or ISO 8601 string (client-side filtering)
|
||||||
|
:param updated_in_past_hours: Filter by properties updated in the last _ hours. Accepts int or timedelta (client-side filtering)
|
||||||
:param beds_min, beds_max: Filter by number of bedrooms
|
:param beds_min, beds_max: Filter by number of bedrooms
|
||||||
:param baths_min, baths_max: Filter by number of bathrooms
|
:param baths_min, baths_max: Filter by number of bathrooms
|
||||||
:param sqft_min, sqft_max: Filter by square footage
|
:param sqft_min, sqft_max: Filter by square footage
|
||||||
@@ -77,6 +87,8 @@ def scrape_property(
|
|||||||
:param year_built_min, year_built_max: Filter by year built
|
:param year_built_min, year_built_max: Filter by year built
|
||||||
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
|
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
|
||||||
:param sort_direction: Sort direction (asc, desc)
|
:param sort_direction: Sort direction (asc, desc)
|
||||||
|
|
||||||
|
Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
|
||||||
"""
|
"""
|
||||||
validate_input(listing_type)
|
validate_input(listing_type)
|
||||||
validate_dates(date_from, date_to)
|
validate_dates(date_from, date_to)
|
||||||
@@ -90,6 +102,12 @@ def scrape_property(
|
|||||||
)
|
)
|
||||||
validate_sort(sort_by, sort_direction)
|
validate_sort(sort_by, sort_direction)
|
||||||
|
|
||||||
|
# Validate new last_update_date filtering parameters
|
||||||
|
validate_last_update_filters(
|
||||||
|
convert_to_datetime_string(updated_since),
|
||||||
|
extract_timedelta_hours(updated_in_past_hours)
|
||||||
|
)
|
||||||
|
|
||||||
# Convert listing_type to appropriate format
|
# Convert listing_type to appropriate format
|
||||||
if listing_type is None:
|
if listing_type is None:
|
||||||
converted_listing_type = None
|
converted_listing_type = None
|
||||||
@@ -98,6 +116,14 @@ def scrape_property(
|
|||||||
else:
|
else:
|
||||||
converted_listing_type = ListingType(listing_type.upper())
|
converted_listing_type = ListingType(listing_type.upper())
|
||||||
|
|
||||||
|
# Convert datetime/timedelta objects to appropriate formats
|
||||||
|
converted_past_days = extract_timedelta_days(past_days)
|
||||||
|
converted_past_hours = extract_timedelta_hours(past_hours)
|
||||||
|
converted_datetime_from = convert_to_datetime_string(datetime_from)
|
||||||
|
converted_datetime_to = convert_to_datetime_string(datetime_to)
|
||||||
|
converted_updated_since = convert_to_datetime_string(updated_since)
|
||||||
|
converted_updated_in_past_hours = extract_timedelta_hours(updated_in_past_hours)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
location=location,
|
location=location,
|
||||||
listing_type=converted_listing_type,
|
listing_type=converted_listing_type,
|
||||||
@@ -106,7 +132,7 @@ def scrape_property(
|
|||||||
proxy=proxy,
|
proxy=proxy,
|
||||||
radius=radius,
|
radius=radius,
|
||||||
mls_only=mls_only,
|
mls_only=mls_only,
|
||||||
last_x_days=past_days,
|
last_x_days=converted_past_days,
|
||||||
date_from=date_from,
|
date_from=date_from,
|
||||||
date_to=date_to,
|
date_to=date_to,
|
||||||
foreclosure=foreclosure,
|
foreclosure=foreclosure,
|
||||||
@@ -115,9 +141,12 @@ def scrape_property(
|
|||||||
limit=limit,
|
limit=limit,
|
||||||
offset=offset,
|
offset=offset,
|
||||||
# New date/time filtering
|
# New date/time filtering
|
||||||
past_hours=past_hours,
|
past_hours=converted_past_hours,
|
||||||
datetime_from=datetime_from,
|
datetime_from=converted_datetime_from,
|
||||||
datetime_to=datetime_to,
|
datetime_to=converted_datetime_to,
|
||||||
|
# New last_update_date filtering
|
||||||
|
updated_since=converted_updated_since,
|
||||||
|
updated_in_past_hours=converted_updated_in_past_hours,
|
||||||
# New property filtering
|
# New property filtering
|
||||||
beds_min=beds_min,
|
beds_min=beds_min,
|
||||||
beds_max=beds_max,
|
beds_max=beds_max,
|
||||||
|
|||||||
@@ -33,6 +33,10 @@ class ScraperInput(BaseModel):
|
|||||||
datetime_from: str | None = None
|
datetime_from: str | None = None
|
||||||
datetime_to: str | None = None
|
datetime_to: str | None = None
|
||||||
|
|
||||||
|
# New last_update_date filtering parameters
|
||||||
|
updated_since: str | None = None
|
||||||
|
updated_in_past_hours: int | None = None
|
||||||
|
|
||||||
# New property filtering parameters
|
# New property filtering parameters
|
||||||
beds_min: int | None = None
|
beds_min: int | None = None
|
||||||
beds_max: int | None = None
|
beds_max: int | None = None
|
||||||
@@ -115,6 +119,10 @@ class Scraper:
|
|||||||
self.datetime_from = scraper_input.datetime_from
|
self.datetime_from = scraper_input.datetime_from
|
||||||
self.datetime_to = scraper_input.datetime_to
|
self.datetime_to = scraper_input.datetime_to
|
||||||
|
|
||||||
|
# New last_update_date filtering
|
||||||
|
self.updated_since = scraper_input.updated_since
|
||||||
|
self.updated_in_past_hours = scraper_input.updated_in_past_hours
|
||||||
|
|
||||||
# New property filtering
|
# New property filtering
|
||||||
self.beds_min = scraper_input.beds_min
|
self.beds_min = scraper_input.beds_min
|
||||||
self.beds_max = scraper_input.beds_max
|
self.beds_max = scraper_input.beds_max
|
||||||
|
|||||||
@@ -558,6 +558,10 @@ class RealtorScraper(Scraper):
|
|||||||
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
|
elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
|
||||||
homes = self._apply_pending_date_filter(homes)
|
homes = self._apply_pending_date_filter(homes)
|
||||||
|
|
||||||
|
# Apply client-side filtering by last_update_date if specified
|
||||||
|
if self.updated_since or self.updated_in_past_hours:
|
||||||
|
homes = self._apply_last_update_date_filter(homes)
|
||||||
|
|
||||||
# Apply client-side sort to ensure results are properly ordered
|
# Apply client-side sort to ensure results are properly ordered
|
||||||
# This is necessary after filtering and to guarantee sort order across page boundaries
|
# This is necessary after filtering and to guarantee sort order across page boundaries
|
||||||
if self.sort_by:
|
if self.sort_by:
|
||||||
@@ -730,6 +734,50 @@ class RealtorScraper(Scraper):
|
|||||||
return getattr(home.flags, 'is_contingent', False)
|
return getattr(home.flags, 'is_contingent', False)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _apply_last_update_date_filter(self, homes):
    """Keep only the homes whose last_update_date passes the configured cutoff.

    The cutoff comes from ``self.updated_in_past_hours`` when it is truthy
    (now minus that many hours); otherwise from ``self.updated_since``
    (an ISO 8601 string, a trailing 'Z' is accepted). When neither is set,
    or ``updated_since`` cannot be parsed, ``homes`` is returned unchanged.

    :param homes: sequence of scraped home objects
    :return: ``homes`` itself when no filtering happens, otherwise a new list
        with the homes that passed; homes with no last_update_date are dropped
    """
    if not homes:
        return homes

    from datetime import datetime, timedelta

    cutoff = None

    if self.updated_in_past_hours:
        # Naive local "now".
        # NOTE(review): assumes _extract_date_from_home yields naive datetimes
        # in the same clock - confirm against that helper.
        cutoff = datetime.now() - timedelta(hours=self.updated_in_past_hours)
    elif self.updated_since:
        try:
            iso_text = self.updated_since
            if iso_text.endswith('Z'):
                iso_text = iso_text.replace('Z', '+00:00')
            # NOTE(review): replace(tzinfo=None) discards any UTC offset
            # without converting the wall-clock time - confirm this is intended.
            cutoff = datetime.fromisoformat(iso_text).replace(tzinfo=None)
        except (ValueError, AttributeError):
            # Best effort: an unparseable value disables the filter.
            return homes

    if cutoff is None:
        return homes

    # Same range description is handed to the range helper for every home.
    since_range = {'type': 'since', 'date': cutoff}

    kept = []
    for home in homes:
        updated_at = self._extract_date_from_home(home, 'last_update_date')
        # Homes without a last_update_date never match.
        if updated_at is not None and self._is_datetime_in_range(updated_at, since_range):
            kept.append(home)

    return kept
|
||||||
|
|
||||||
def _get_date_range(self):
|
def _get_date_range(self):
|
||||||
"""Get the date range for filtering based on instance parameters."""
|
"""Get the date range for filtering based on instance parameters."""
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|||||||
@@ -172,7 +172,7 @@ def validate_input(listing_type: str | list[str] | None) -> None:
|
|||||||
|
|
||||||
def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
||||||
if isinstance(date_from, str) != isinstance(date_to, str):
|
if isinstance(date_from, str) != isinstance(date_to, str):
|
||||||
raise InvalidDate("Both date_from and date_to must be provided.")
|
raise InvalidDate("Both date_from and date_to must be provided together.")
|
||||||
|
|
||||||
if date_from and date_to:
|
if date_from and date_to:
|
||||||
try:
|
try:
|
||||||
@@ -180,9 +180,16 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
|||||||
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
|
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
|
||||||
|
|
||||||
if date_to_obj < date_from_obj:
|
if date_to_obj < date_from_obj:
|
||||||
raise InvalidDate("date_to must be after date_from.")
|
raise InvalidDate(f"date_to ('{date_to}') must be after date_from ('{date_from}').")
|
||||||
except ValueError:
|
except ValueError as e:
|
||||||
raise InvalidDate(f"Invalid date format or range")
|
# Provide specific guidance on the expected format
|
||||||
|
if "does not match format" in str(e):
|
||||||
|
raise InvalidDate(
|
||||||
|
f"Invalid date format. Expected 'YYYY-MM-DD' format. "
|
||||||
|
f"Examples: '2025-01-20', '2024-12-31'. "
|
||||||
|
f"Got: date_from='{date_from}', date_to='{date_to}'"
|
||||||
|
)
|
||||||
|
raise InvalidDate(f"Invalid date format or range: {e}")
|
||||||
|
|
||||||
|
|
||||||
def validate_limit(limit: int) -> None:
|
def validate_limit(limit: int) -> None:
|
||||||
@@ -222,21 +229,53 @@ def validate_offset(offset: int, limit: int = 10000) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def validate_datetime(datetime_value) -> None:
    """Validate a datetime filter value.

    Accepted values:
    - None or an empty string (treated as "not provided")
    - datetime.datetime / datetime.date objects
    - ISO 8601 strings (a 'Z' suffix is accepted)

    :param datetime_value: the value to validate
    :raises InvalidDate: if the value has an unsupported type or is a string
        that cannot be parsed as ISO 8601
    """
    # The earlier string-only version skipped every falsy value, so an empty
    # string must keep meaning "no value" for existing callers.
    if datetime_value is None or (isinstance(datetime_value, str) and not datetime_value):
        return

    from datetime import date

    # datetime is a subclass of date, so this single check covers both.
    if isinstance(datetime_value, date):
        return

    # Anything else must be a string holding an ISO 8601 value.
    if not isinstance(datetime_value, str):
        raise InvalidDate(
            f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. "
            f"Got: {type(datetime_value).__name__}"
        )

    try:
        # 'Z' is rewritten to an explicit offset because fromisoformat only
        # accepts it natively on newer Python versions.
        datetime.fromisoformat(datetime_value.replace('Z', '+00:00'))
    except ValueError as exc:
        raise InvalidDate(
            f"Invalid datetime format: '{datetime_value}'. "
            f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')."
        ) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def validate_last_update_filters(updated_since: str | None, updated_in_past_hours: int | None) -> None:
|
||||||
|
"""Validate last_update_date filtering parameters."""
|
||||||
|
if updated_since and updated_in_past_hours:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot use both 'updated_since' and 'updated_in_past_hours' parameters together. "
|
||||||
|
"Please use only one method to filter by last_update_date."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Validate updated_since format if provided
|
||||||
|
if updated_since:
|
||||||
|
validate_datetime(updated_since)
|
||||||
|
|
||||||
|
# Validate updated_in_past_hours range if provided
|
||||||
|
if updated_in_past_hours is not None:
|
||||||
|
if updated_in_past_hours < 1:
|
||||||
|
raise ValueError(
|
||||||
|
f"updated_in_past_hours must be at least 1. Got: {updated_in_past_hours}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def validate_filters(
|
def validate_filters(
|
||||||
beds_min: int | None = None,
|
beds_min: int | None = None,
|
||||||
beds_max: int | None = None,
|
beds_max: int | None = None,
|
||||||
@@ -282,3 +321,95 @@ def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> N
|
|||||||
f"Invalid sort_direction value: '{sort_direction}'. "
|
f"Invalid sort_direction value: '{sort_direction}'. "
|
||||||
f"Valid options: {', '.join(valid_directions)}"
|
f"Valid options: {', '.join(valid_directions)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_datetime_string(value) -> str | None:
|
||||||
|
"""
|
||||||
|
Convert datetime object or string to ISO 8601 string format.
|
||||||
|
|
||||||
|
Accepts:
|
||||||
|
- datetime.datetime objects
|
||||||
|
- datetime.date objects
|
||||||
|
- ISO 8601 strings (returned as-is)
|
||||||
|
- None (returns None)
|
||||||
|
|
||||||
|
Returns ISO 8601 formatted string or None.
|
||||||
|
"""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Already a string - return as-is
|
||||||
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
|
|
||||||
|
# datetime.datetime object
|
||||||
|
from datetime import datetime, date
|
||||||
|
if isinstance(value, datetime):
|
||||||
|
return value.isoformat()
|
||||||
|
|
||||||
|
# datetime.date object (convert to datetime at midnight)
|
||||||
|
if isinstance(value, date):
|
||||||
|
return datetime.combine(value, datetime.min.time()).isoformat()
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. "
|
||||||
|
f"Got: {type(value).__name__}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_timedelta_hours(value) -> int | None:
|
||||||
|
"""
|
||||||
|
Extract hours from int or timedelta object.
|
||||||
|
|
||||||
|
Accepts:
|
||||||
|
- int (returned as-is)
|
||||||
|
- timedelta objects (converted to total hours)
|
||||||
|
- None (returns None)
|
||||||
|
|
||||||
|
Returns integer hours or None.
|
||||||
|
"""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Already an int - return as-is
|
||||||
|
if isinstance(value, int):
|
||||||
|
return value
|
||||||
|
|
||||||
|
# timedelta object - convert to hours
|
||||||
|
from datetime import timedelta
|
||||||
|
if isinstance(value, timedelta):
|
||||||
|
return int(value.total_seconds() / 3600)
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid past_hours value. Expected int or timedelta object. "
|
||||||
|
f"Got: {type(value).__name__}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_timedelta_days(value) -> int | None:
|
||||||
|
"""
|
||||||
|
Extract days from int or timedelta object.
|
||||||
|
|
||||||
|
Accepts:
|
||||||
|
- int (returned as-is)
|
||||||
|
- timedelta objects (converted to total days)
|
||||||
|
- None (returns None)
|
||||||
|
|
||||||
|
Returns integer days or None.
|
||||||
|
"""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Already an int - return as-is
|
||||||
|
if isinstance(value, int):
|
||||||
|
return value
|
||||||
|
|
||||||
|
# timedelta object - convert to days
|
||||||
|
from datetime import timedelta
|
||||||
|
if isinstance(value, timedelta):
|
||||||
|
return int(value.total_seconds() / 86400) # 86400 seconds in a day
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid past_days value. Expected int or timedelta object. "
|
||||||
|
f"Got: {type(value).__name__}"
|
||||||
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user