diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index b0599be..45721c6 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,7 +1,12 @@ import warnings import pandas as pd +from datetime import datetime, timedelta from .core.scrapers import ScraperInput -from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_offset, validate_datetime, validate_filters, validate_sort +from .utils import ( + process_result, ordered_properties, validate_input, validate_dates, validate_limit, + validate_offset, validate_datetime, validate_filters, validate_sort, validate_last_update_filters, + convert_to_datetime_string, extract_timedelta_hours, extract_timedelta_days +) from .core.scrapers.realtor import RealtorScraper from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from typing import Union, Optional, List @@ -13,7 +18,7 @@ def scrape_property( property_type: Optional[List[str]] = None, radius: float = None, mls_only: bool = False, - past_days: int = None, + past_days: int | timedelta = None, proxy: str = None, date_from: str = None, date_to: str = None, @@ -23,9 +28,12 @@ def scrape_property( limit: int = 10000, offset: int = 0, # New date/time filtering parameters - past_hours: int = None, - datetime_from: str = None, - datetime_to: str = None, + past_hours: int | timedelta = None, + datetime_from: datetime | str = None, + datetime_to: datetime | str = None, + # New last_update_date filtering parameters + updated_since: datetime | str = None, + updated_in_past_hours: int | timedelta = None, # New property filtering parameters beds_min: int = None, beds_max: int = None, @@ -67,8 +75,10 @@ def scrape_property( :param offset: Starting position for pagination within the 10k limit (offset + limit cannot exceed 10,000). Use with limit to fetch results in chunks (e.g., offset=200, limit=200 fetches results 200-399). 
Should be a multiple of 200 (page size) for optimal performance. Default is 0. Note: Cannot be used to bypass the 10k API limit - use date ranges (date_from/date_to) to narrow searches and fetch more data. New parameters: - :param past_hours: Get properties in the last _ hours (requires client-side filtering) - :param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00") + :param past_hours: Get properties in the last _ hours (requires client-side filtering). Accepts int or timedelta. + :param datetime_from, datetime_to: Precise time filtering. Accepts datetime objects or ISO 8601 strings (e.g. "2025-01-20T14:30:00") + :param updated_since: Filter by last_update_date (when property was last updated). Accepts datetime object or ISO 8601 string (client-side filtering) + :param updated_in_past_hours: Filter by properties updated in the last _ hours. Accepts int or timedelta (client-side filtering) :param beds_min, beds_max: Filter by number of bedrooms :param baths_min, baths_max: Filter by number of bathrooms :param sqft_min, sqft_max: Filter by square footage @@ -77,6 +87,8 @@ def scrape_property( :param year_built_min, year_built_max: Filter by year built :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_direction: Sort direction (asc, desc) + + Note: past_days and past_hours also accept timedelta objects for more Pythonic usage. 
""" validate_input(listing_type) validate_dates(date_from, date_to) @@ -90,6 +102,12 @@ def scrape_property( ) validate_sort(sort_by, sort_direction) + # Validate new last_update_date filtering parameters + validate_last_update_filters( + convert_to_datetime_string(updated_since), + extract_timedelta_hours(updated_in_past_hours) + ) + # Convert listing_type to appropriate format if listing_type is None: converted_listing_type = None @@ -98,6 +116,14 @@ def scrape_property( else: converted_listing_type = ListingType(listing_type.upper()) + # Convert datetime/timedelta objects to appropriate formats + converted_past_days = extract_timedelta_days(past_days) + converted_past_hours = extract_timedelta_hours(past_hours) + converted_datetime_from = convert_to_datetime_string(datetime_from) + converted_datetime_to = convert_to_datetime_string(datetime_to) + converted_updated_since = convert_to_datetime_string(updated_since) + converted_updated_in_past_hours = extract_timedelta_hours(updated_in_past_hours) + scraper_input = ScraperInput( location=location, listing_type=converted_listing_type, @@ -106,7 +132,7 @@ def scrape_property( proxy=proxy, radius=radius, mls_only=mls_only, - last_x_days=past_days, + last_x_days=converted_past_days, date_from=date_from, date_to=date_to, foreclosure=foreclosure, @@ -115,9 +141,12 @@ def scrape_property( limit=limit, offset=offset, # New date/time filtering - past_hours=past_hours, - datetime_from=datetime_from, - datetime_to=datetime_to, + past_hours=converted_past_hours, + datetime_from=converted_datetime_from, + datetime_to=converted_datetime_to, + # New last_update_date filtering + updated_since=converted_updated_since, + updated_in_past_hours=converted_updated_in_past_hours, # New property filtering beds_min=beds_min, beds_max=beds_max, diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index caeda2c..0a0b539 100644 --- a/homeharvest/core/scrapers/__init__.py +++ 
def _apply_last_update_date_filter(self, homes):
    """Keep only the homes whose last_update_date passes the configured cutoff.

    Runs client-side when ``updated_in_past_hours`` or ``updated_since`` is
    set on the scraper; ``updated_in_past_hours`` wins when both are truthy.

    :param homes: list of property objects to filter
    :return: the original ``homes`` object when it is empty, when no filter
        is configured, or when ``updated_since`` cannot be parsed; otherwise
        a new list with the homes accepted by ``_is_datetime_in_range``.
        Homes for which ``_extract_date_from_home`` yields ``None`` are dropped.
    """
    from datetime import datetime, timedelta

    if not homes:
        return homes

    # Work out the cutoff moment; bail out untouched if there is none.
    # NOTE(review): datetime.now() is naive local time, and any UTC offset in
    # updated_since is discarded (not converted) by replace(tzinfo=None) --
    # confirm which timezone _extract_date_from_home returns.
    if self.updated_in_past_hours:
        cutoff = datetime.now() - timedelta(hours=self.updated_in_past_hours)
    elif self.updated_since:
        try:
            raw = self.updated_since
            if raw.endswith('Z'):
                # fromisoformat on older Pythons rejects the 'Z' designator
                raw = raw.replace('Z', '+00:00')
            cutoff = datetime.fromisoformat(raw).replace(tzinfo=None)
        except (ValueError, AttributeError):
            # Best effort: an unparseable value leaves the results unfiltered
            return homes
    else:
        return homes

    window = {'type': 'since', 'date': cutoff}

    kept = []
    for home in homes:
        stamp = self._extract_date_from_home(home, 'last_update_date')
        # Homes without a last_update_date never pass the filter
        if stamp is not None and self._is_datetime_in_range(stamp, window):
            kept.append(home)
    return kept
datetime.strptime(date_to, "%Y-%m-%d") if date_to_obj < date_from_obj: - raise InvalidDate("date_to must be after date_from.") - except ValueError: - raise InvalidDate(f"Invalid date format or range") + raise InvalidDate(f"date_to ('{date_to}') must be after date_from ('{date_from}').") + except ValueError as e: + # Provide specific guidance on the expected format + if "does not match format" in str(e): + raise InvalidDate( + f"Invalid date format. Expected 'YYYY-MM-DD' format. " + f"Examples: '2025-01-20', '2024-12-31'. " + f"Got: date_from='{date_from}', date_to='{date_to}'" + ) + raise InvalidDate(f"Invalid date format or range: {e}") def validate_limit(limit: int) -> None: @@ -222,21 +229,53 @@ def validate_offset(offset: int, limit: int = 10000) -> None: ) -def validate_datetime(datetime_str: str | None) -> None: - """Validate ISO 8601 datetime format.""" - if not datetime_str: +def validate_datetime(datetime_value) -> None: + """Validate datetime value (accepts datetime objects or ISO 8601 strings).""" + if datetime_value is None: return + # Already a datetime object - valid + from datetime import datetime as dt, date + if isinstance(datetime_value, (dt, date)): + return + + # Must be a string - validate ISO 8601 format + if not isinstance(datetime_value, str): + raise InvalidDate( + f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. " + f"Got: {type(datetime_value).__name__}" + ) + try: # Try parsing as ISO 8601 datetime - datetime.fromisoformat(datetime_str.replace('Z', '+00:00')) + datetime.fromisoformat(datetime_value.replace('Z', '+00:00')) except (ValueError, AttributeError): raise InvalidDate( - f"Invalid datetime format: '{datetime_str}'. " + f"Invalid datetime format: '{datetime_value}'. " f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')." 
) +def validate_last_update_filters(updated_since: str | None, updated_in_past_hours: int | None) -> None: + """Validate last_update_date filtering parameters.""" + if updated_since and updated_in_past_hours: + raise ValueError( + "Cannot use both 'updated_since' and 'updated_in_past_hours' parameters together. " + "Please use only one method to filter by last_update_date." + ) + + # Validate updated_since format if provided + if updated_since: + validate_datetime(updated_since) + + # Validate updated_in_past_hours range if provided + if updated_in_past_hours is not None: + if updated_in_past_hours < 1: + raise ValueError( + f"updated_in_past_hours must be at least 1. Got: {updated_in_past_hours}" + ) + + def validate_filters( beds_min: int | None = None, beds_max: int | None = None, @@ -282,3 +321,95 @@ def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> N f"Invalid sort_direction value: '{sort_direction}'. " f"Valid options: {', '.join(valid_directions)}" ) + + +def convert_to_datetime_string(value) -> str | None: + """ + Convert datetime object or string to ISO 8601 string format. + + Accepts: + - datetime.datetime objects + - datetime.date objects + - ISO 8601 strings (returned as-is) + - None (returns None) + + Returns ISO 8601 formatted string or None. + """ + if value is None: + return None + + # Already a string - return as-is + if isinstance(value, str): + return value + + # datetime.datetime object + from datetime import datetime, date + if isinstance(value, datetime): + return value.isoformat() + + # datetime.date object (convert to datetime at midnight) + if isinstance(value, date): + return datetime.combine(value, datetime.min.time()).isoformat() + + raise ValueError( + f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. " + f"Got: {type(value).__name__}" + ) + + +def extract_timedelta_hours(value) -> int | None: + """ + Extract hours from int or timedelta object. 
+ + Accepts: + - int (returned as-is) + - timedelta objects (converted to total hours) + - None (returns None) + + Returns integer hours or None. + """ + if value is None: + return None + + # Already an int - return as-is + if isinstance(value, int): + return value + + # timedelta object - convert to hours + from datetime import timedelta + if isinstance(value, timedelta): + return int(value.total_seconds() / 3600) + + raise ValueError( + f"Invalid past_hours value. Expected int or timedelta object. " + f"Got: {type(value).__name__}" + ) + + +def extract_timedelta_days(value) -> int | None: + """ + Extract days from int or timedelta object. + + Accepts: + - int (returned as-is) + - timedelta objects (converted to total days) + - None (returns None) + + Returns integer days or None. + """ + if value is None: + return None + + # Already an int - return as-is + if isinstance(value, int): + return value + + # timedelta object - convert to days + from datetime import timedelta + if isinstance(value, timedelta): + return int(value.total_seconds() / 86400) # 86400 seconds in a day + + raise ValueError( + f"Invalid past_days value. Expected int or timedelta object. " + f"Got: {type(value).__name__}" + )