diff --git a/README.md b/README.md index b98fd25..dac3583 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,97 @@ properties = scrape_property( ) ``` +### Advanced Filtering Examples + +#### Hour-Based Filtering +```py +# Get properties listed in the last 24 hours +properties = scrape_property( + location="Austin, TX", + listing_type="for_sale", + past_hours=24 +) + +# Get properties listed during specific hours (e.g., business hours) +properties = scrape_property( + location="Dallas, TX", + listing_type="for_sale", + datetime_from="2025-01-20T09:00:00", + datetime_to="2025-01-20T17:00:00" +) +``` + +#### Property Filters +```py +# Filter by bedrooms, bathrooms, and square footage +properties = scrape_property( + location="San Francisco, CA", + listing_type="for_sale", + beds_min=2, + beds_max=4, + baths_min=2.0, + sqft_min=1000, + sqft_max=2500 +) + +# Filter by price range +properties = scrape_property( + location="Phoenix, AZ", + listing_type="for_sale", + price_min=200000, + price_max=500000 +) + +# Filter by year built +properties = scrape_property( + location="Seattle, WA", + listing_type="for_sale", + year_built_min=2000, + beds_min=3 +) + +# Combine multiple filters +properties = scrape_property( + location="Denver, CO", + listing_type="for_sale", + beds_min=3, + baths_min=2.0, + sqft_min=1500, + price_min=300000, + price_max=600000, + year_built_min=1990, + lot_sqft_min=5000 +) +``` + +#### Sorting Results +```py +# Sort by price (cheapest first) +properties = scrape_property( + location="Miami, FL", + listing_type="for_sale", + sort_by="list_price", + sort_direction="asc", + limit=100 +) + +# Sort by newest listings +properties = scrape_property( + location="Boston, MA", + listing_type="for_sale", + sort_by="list_date", + sort_direction="desc" +) + +# Sort by square footage (largest first) +properties = scrape_property( + location="Los Angeles, CA", + listing_type="for_sale", + sort_by="sqft", + sort_direction="desc" +) +``` + ## Output ```plaintext >>> properties.head() @@ -137,11 +228,46 @@ Optional ├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale). │ Example: 30 (fetches properties listed/sold in the last 30 days) │ +├── past_hours (integer): Number of past hours to filter properties (more precise than past_days). Uses client-side filtering. +│ Example: 24 (fetches properties from the last 24 hours) +│ Note: Cannot be used together with past_days or date_from/date_to +│ ├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required. | (use this to get properties in chunks as there's a 10k result limit) │ Format for both must be "YYYY-MM-DD". │ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates) │ +├── datetime_from, datetime_to (string): ISO 8601 datetime strings for hour-precise filtering. Uses client-side filtering. +│ Format: "YYYY-MM-DDTHH:MM:SS" or "YYYY-MM-DD" +│ Example: "2025-01-20T09:00:00", "2025-01-20T17:00:00" (fetches properties between 9 AM and 5 PM) +│ Note: Cannot be used together with date_from/date_to +│ +├── beds_min, beds_max (integer): Filter by number of bedrooms +│ Example: beds_min=2, beds_max=4 (2-4 bedrooms) +│ +├── baths_min, baths_max (float): Filter by number of bathrooms +│ Example: baths_min=2.0, baths_max=3.5 (2-3.5 bathrooms) +│ +├── sqft_min, sqft_max (integer): Filter by square footage +│ Example: sqft_min=1000, sqft_max=2500 (1,000-2,500 sq ft) +│ +├── price_min, price_max (integer): Filter by listing price +│ Example: price_min=200000, price_max=500000 ($200k-$500k) +│ +├── lot_sqft_min, lot_sqft_max (integer): Filter by lot size in square feet +│ Example: lot_sqft_min=5000, lot_sqft_max=10000 (5,000-10,000 sq ft lot) +│ +├── year_built_min, year_built_max (integer): Filter by year built +│ Example: year_built_min=2000, year_built_max=2024 (built between 2000-2024) +│ +├── sort_by (string): Sort results by field +│ Options: 'list_date', 'sold_date', 'list_price', 'sqft', 'beds', 'baths' +│ Example: sort_by='list_price' +│ +├── sort_direction (string): Sort direction, default is 'desc' +│ Options: 'asc' (ascending), 'desc' (descending) +│ Example: sort_direction='asc' (cheapest first) +│ ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) │ ├── foreclosure (True/False): If set, fetches only foreclosures @@ -194,10 +320,10 @@ Property │ ├── list_price │ ├── list_price_min │ ├── list_price_max -│ ├── list_date # datetime -│ ├── pending_date # datetime +│ ├── list_date # datetime (full timestamp: YYYY-MM-DD HH:MM:SS) +│ ├── pending_date # datetime (full timestamp: YYYY-MM-DD HH:MM:SS) │ ├── sold_price -│ ├── last_sold_date # datetime +│ ├── last_sold_date # datetime (full timestamp: YYYY-MM-DD HH:MM:SS) │ ├── last_sold_price │ ├── price_per_sqft │ ├── new_construction diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index aed71ee..9078d96 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,7 +1,7 @@ import warnings import pandas as pd from .core.scrapers import ScraperInput -from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit +from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit, validate_datetime, validate_filters, validate_sort from .core.scrapers.realtor import RealtorScraper from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from typing import Union, Optional, List @@ -15,15 +15,36 @@ def scrape_property( mls_only: bool = False, past_days: int = None, proxy: str = None, - date_from: str = None, #: TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation + date_from: str = None, date_to: str = None, foreclosure: bool = None, extra_property_data: bool = True, exclude_pending: bool = False, - limit: int = 10000 + limit: int = 10000, + # New date/time filtering parameters + past_hours: int = None, + datetime_from: str = None, + datetime_to: str = None, + # New property filtering parameters + beds_min: int = None, + beds_max: int = None, + baths_min: float = None, + baths_max: float = None, + sqft_min: int = None, + sqft_max: int = None, + price_min: int = None, + price_max: int = None, + lot_sqft_min: int = None, + lot_sqft_max: int = None, + year_built_min: int = None, + year_built_max: int = None, + # New sorting parameters + sort_by: str = None, + sort_direction: str = "desc", ) -> Union[pd.DataFrame, list[dict], list[Property]]: """ Scrape properties from Realtor.com based on a given location and listing type. + :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param listing_type: Listing Type (for_sale, for_rent, sold, pending) :param return_type: Return type (pandas, pydantic, raw) @@ -40,10 +61,29 @@ def scrape_property( :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. :param limit: Limit the number of results returned. Maximum is 10,000. + + New parameters: + :param past_hours: Get properties in the last _ hours (requires client-side filtering) + :param datetime_from, datetime_to: ISO 8601 datetime strings for precise time filtering (e.g. "2025-01-20T14:30:00") + :param beds_min, beds_max: Filter by number of bedrooms + :param baths_min, baths_max: Filter by number of bathrooms + :param sqft_min, sqft_max: Filter by square footage + :param price_min, price_max: Filter by listing price + :param lot_sqft_min, lot_sqft_max: Filter by lot size + :param year_built_min, year_built_max: Filter by year built + :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths) + :param sort_direction: Sort direction (asc, desc) """ validate_input(listing_type) validate_dates(date_from, date_to) validate_limit(limit) + validate_datetime(datetime_from) + validate_datetime(datetime_to) + validate_filters( + beds_min, beds_max, baths_min, baths_max, sqft_min, sqft_max, + price_min, price_max, lot_sqft_min, lot_sqft_max, year_built_min, year_built_max + ) + validate_sort(sort_by, sort_direction) scraper_input = ScraperInput( location=location, @@ -60,6 +100,26 @@ def scrape_property( extra_property_data=extra_property_data, exclude_pending=exclude_pending, limit=limit, + # New date/time filtering + past_hours=past_hours, + datetime_from=datetime_from, + datetime_to=datetime_to, + # New property filtering + beds_min=beds_min, + beds_max=beds_max, + baths_min=baths_min, + baths_max=baths_max, + sqft_min=sqft_min, + sqft_max=sqft_max, + price_min=price_min, + price_max=price_max, + lot_sqft_min=lot_sqft_min, + lot_sqft_max=lot_sqft_max, + year_built_min=year_built_min, + year_built_max=year_built_max, + # New sorting + sort_by=sort_by, + sort_direction=sort_direction, ) site = RealtorScraper(scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 8e243c1..76c667c 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -27,6 +27,29 @@ class ScraperInput(BaseModel): limit: int = 10000 return_type: ReturnType = ReturnType.pandas + # New date/time filtering parameters + past_hours: int | None = None + datetime_from: str | None = None + datetime_to: str | None = None + + # New property filtering parameters + beds_min: int | None = None + beds_max: int | None = None + baths_min: float | None = None + baths_max: float | None = None + sqft_min: int | None = None + sqft_max: int | None = None + price_min: int | None = None + price_max: int | None = None + lot_sqft_min: int | None = None + lot_sqft_max: int | None = None + year_built_min: int | None = None + year_built_max: int | None = None + + # New sorting parameters + sort_by: str | None = None + sort_direction: str = "desc" + class Scraper: session = None @@ -85,6 +108,29 @@ class Scraper: self.limit = scraper_input.limit self.return_type = scraper_input.return_type + # New date/time filtering + self.past_hours = scraper_input.past_hours + self.datetime_from = scraper_input.datetime_from + self.datetime_to = scraper_input.datetime_to + + # New property filtering + self.beds_min = scraper_input.beds_min + self.beds_max = scraper_input.beds_max + self.baths_min = scraper_input.baths_min + self.baths_max = scraper_input.baths_max + self.sqft_min = scraper_input.sqft_min + self.sqft_max = scraper_input.sqft_max + self.price_min = scraper_input.price_min + self.price_max = scraper_input.price_max + self.lot_sqft_min = scraper_input.lot_sqft_min + self.lot_sqft_max = scraper_input.lot_sqft_max + self.year_built_min = scraper_input.year_built_min + self.year_built_max = scraper_input.year_built_max + + # New sorting + self.sort_by = scraper_input.sort_by + self.sort_direction = scraper_input.sort_direction + def search(self) -> list[Union[Property | dict]]: ... @staticmethod diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index d3a538c..19f2040 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -132,36 +132,138 @@ class RealtorScraper(Scraper): """ date_param = "" + + # Determine date field based on listing type if self.listing_type == ListingType.SOLD: - if self.date_from and self.date_to: - date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}' - elif self.last_x_days: - date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}' - elif self.listing_type == ListingType.PENDING: - # Skip server-side date filtering for PENDING as both pending_date and contract_date + date_field = "sold_date" + elif self.listing_type in [ListingType.FOR_SALE, ListingType.FOR_RENT]: + date_field = "list_date" + else: # PENDING + # Skip server-side date filtering for PENDING as both pending_date and contract_date # filters are broken in the API. Client-side filtering will be applied later. - pass - else: - if self.date_from and self.date_to: - date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}' + date_field = None + + # Build date parameter (expand to full days if hour-based filtering is used) + if date_field: + if self.datetime_from or self.datetime_to: + # Hour-based datetime filtering: extract date parts for API, client-side filter by hours + from datetime import datetime + + min_date = None + max_date = None + + if self.datetime_from: + try: + dt_from = datetime.fromisoformat(self.datetime_from.replace('Z', '+00:00')) + min_date = dt_from.strftime("%Y-%m-%d") + except (ValueError, AttributeError): + pass + + if self.datetime_to: + try: + dt_to = datetime.fromisoformat(self.datetime_to.replace('Z', '+00:00')) + max_date = dt_to.strftime("%Y-%m-%d") + except (ValueError, AttributeError): + pass + + if min_date and max_date: + date_param = f'{date_field}: {{ min: "{min_date}", max: "{max_date}" }}' + elif min_date: + date_param = f'{date_field}: {{ min: "{min_date}" }}' + elif max_date: + date_param = f'{date_field}: {{ max: "{max_date}" }}' + + elif self.past_hours: + # Query API for past N days (minimum 1 day), client-side filter by hours + days = max(1, int(self.past_hours / 24) + 1) # Round up to cover the full period + date_param = f'{date_field}: {{ min: "$today-{days}D" }}' + + elif self.date_from and self.date_to: + date_param = f'{date_field}: {{ min: "{self.date_from}", max: "{self.date_to}" }}' elif self.last_x_days: - date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}' + date_param = f'{date_field}: {{ min: "$today-{self.last_x_days}D" }}' property_type_param = "" if self.property_type: property_types = [pt.value for pt in self.property_type] property_type_param = f"type: {json.dumps(property_types)}" - sort_param = ( - "sort: [{ field: sold_date, direction: desc }]" - if self.listing_type == ListingType.SOLD - else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor - ) + # Build property filter parameters + property_filters = [] + + if self.beds_min is not None or self.beds_max is not None: + beds_filter = "beds: {" + if self.beds_min is not None: + beds_filter += f" min: {self.beds_min}" + if self.beds_max is not None: + beds_filter += f" max: {self.beds_max}" + beds_filter += " }" + property_filters.append(beds_filter) + + if self.baths_min is not None or self.baths_max is not None: + baths_filter = "baths: {" + if self.baths_min is not None: + baths_filter += f" min: {self.baths_min}" + if self.baths_max is not None: + baths_filter += f" max: {self.baths_max}" + baths_filter += " }" + property_filters.append(baths_filter) + + if self.sqft_min is not None or self.sqft_max is not None: + sqft_filter = "sqft: {" + if self.sqft_min is not None: + sqft_filter += f" min: {self.sqft_min}" + if self.sqft_max is not None: + sqft_filter += f" max: {self.sqft_max}" + sqft_filter += " }" + property_filters.append(sqft_filter) + + if self.price_min is not None or self.price_max is not None: + price_filter = "list_price: {" + if self.price_min is not None: + price_filter += f" min: {self.price_min}" + if self.price_max is not None: + price_filter += f" max: {self.price_max}" + price_filter += " }" + property_filters.append(price_filter) + + if self.lot_sqft_min is not None or self.lot_sqft_max is not None: + lot_sqft_filter = "lot_sqft: {" + if self.lot_sqft_min is not None: + lot_sqft_filter += f" min: {self.lot_sqft_min}" + if self.lot_sqft_max is not None: + lot_sqft_filter += f" max: {self.lot_sqft_max}" + lot_sqft_filter += " }" + property_filters.append(lot_sqft_filter) + + if self.year_built_min is not None or self.year_built_max is not None: + year_built_filter = "year_built: {" + if self.year_built_min is not None: + year_built_filter += f" min: {self.year_built_min}" + if self.year_built_max is not None: + year_built_filter += f" max: {self.year_built_max}" + year_built_filter += " }" + property_filters.append(year_built_filter) + + property_filters_param = "\n".join(property_filters) + + # Build sort parameter + if self.sort_by: + sort_param = f"sort: [{{ field: {self.sort_by}, direction: {self.sort_direction} }}]" + elif self.listing_type == ListingType.SOLD: + sort_param = "sort: [{ field: sold_date, direction: desc }]" + else: + sort_param = "" #: prioritize normal fractal sort from realtor pending_or_contingent_param = ( "or_filters: { contingent: true, pending: true }" if self.listing_type == ListingType.PENDING else "" ) + # Build bucket parameter (only use fractal sort if no custom sort is specified) + bucket_param = "" + if not self.sort_by: + bucket_param = 'bucket: { sort: "fractal_v1.1.3_fr" }' + listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type is_foreclosure = "" @@ -187,6 +289,7 @@ class RealtorScraper(Scraper): %s %s %s + %s } %s limit: 200 @@ -197,6 +300,7 @@ class RealtorScraper(Scraper): listing_type.value.lower(), date_param, property_type_param, + property_filters_param, pending_or_contingent_param, sort_param, GENERAL_RESULTS_QUERY, @@ -220,8 +324,9 @@ class RealtorScraper(Scraper): %s %s %s + %s } - bucket: { sort: "fractal_v1.1.3_fr" } + %s %s limit: 200 offset: $offset @@ -231,7 +336,9 @@ class RealtorScraper(Scraper): listing_type.value.lower(), date_param, property_type_param, + property_filters_param, pending_or_contingent_param, + bucket_param, sort_param, GENERAL_RESULTS_QUERY, ) @@ -382,13 +489,111 @@ class RealtorScraper(Scraper): for future in as_completed(futures): homes.extend(future.result()["properties"]) + # Apply client-side hour-based filtering if needed + # (API only supports day-level filtering, so we post-filter for hour precision) + if self.past_hours or self.datetime_from or self.datetime_to: + homes = self._apply_hour_based_date_filter(homes) # Apply client-side date filtering for PENDING properties # (server-side filters are broken in the API) - if self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from): + elif self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from): homes = self._apply_pending_date_filter(homes) - + return homes + def _apply_hour_based_date_filter(self, homes): + """Apply client-side hour-based date filtering for all listing types. + + This is used when past_hours, datetime_from, or datetime_to are specified, + since the API only supports day-level filtering. + """ + if not homes: + return homes + + from datetime import datetime, timedelta + + # Determine date range with hour precision + date_range = None + + if self.past_hours: + cutoff_datetime = datetime.now() - timedelta(hours=self.past_hours) + date_range = {'type': 'since', 'date': cutoff_datetime} + elif self.datetime_from or self.datetime_to: + try: + from_datetime = None + to_datetime = None + + if self.datetime_from: + from_datetime_str = self.datetime_from.replace('Z', '+00:00') if self.datetime_from.endswith('Z') else self.datetime_from + from_datetime = datetime.fromisoformat(from_datetime_str).replace(tzinfo=None) + + if self.datetime_to: + to_datetime_str = self.datetime_to.replace('Z', '+00:00') if self.datetime_to.endswith('Z') else self.datetime_to + to_datetime = datetime.fromisoformat(to_datetime_str).replace(tzinfo=None) + + if from_datetime and to_datetime: + date_range = {'type': 'range', 'from_date': from_datetime, 'to_date': to_datetime} + elif from_datetime: + date_range = {'type': 'since', 'date': from_datetime} + elif to_datetime: + date_range = {'type': 'until', 'date': to_datetime} + except (ValueError, AttributeError): + return homes # If parsing fails, return unfiltered + + if not date_range: + return homes + + # Determine which date field to use based on listing type + date_field_name = self._get_date_field_for_listing_type() + + filtered_homes = [] + + for home in homes: + # Extract the appropriate date for this property + property_date = self._extract_date_from_home(home, date_field_name) + + # Handle properties without dates + if property_date is None: + # For PENDING, include contingent properties without pending_date + if self.listing_type == ListingType.PENDING and self._is_contingent(home): + filtered_homes.append(home) + continue + + # Check if property date falls within the specified range + if self._is_datetime_in_range(property_date, date_range): + filtered_homes.append(home) + + return filtered_homes + + def _get_date_field_for_listing_type(self): + """Get the appropriate date field name for the current listing type.""" + if self.listing_type == ListingType.SOLD: + return 'last_sold_date' + elif self.listing_type == ListingType.PENDING: + return 'pending_date' + else: # FOR_SALE or FOR_RENT + return 'list_date' + + def _extract_date_from_home(self, home, date_field_name): + """Extract a date field from a home (handles both dict and Property object).""" + if isinstance(home, dict): + date_value = home.get(date_field_name) + else: + date_value = getattr(home, date_field_name, None) + + if date_value: + return self._parse_date_value(date_value) + return None + + def _is_datetime_in_range(self, date_obj, date_range): + """Check if a datetime object falls within the specified date range (with hour precision).""" + if date_range['type'] == 'since': + return date_obj >= date_range['date'] + elif date_range['type'] == 'until': + return date_obj <= date_range['date'] + elif date_range['type'] == 'range': + return date_range['from_date'] <= date_obj <= date_range['to_date'] + return False + def _apply_pending_date_filter(self, homes): """Apply client-side date filtering for PENDING properties based on pending_date field. For contingent properties without pending_date, tries fallback date fields.""" diff --git a/homeharvest/core/scrapers/realtor/parsers.py b/homeharvest/core/scrapers/realtor/parsers.py index 07905a1..d2f34c9 100644 --- a/homeharvest/core/scrapers/realtor/parsers.py +++ b/homeharvest/core/scrapers/realtor/parsers.py @@ -250,9 +250,28 @@ def parse_description(result: dict) -> Description | None: def calculate_days_on_mls(result: dict) -> Optional[int]: """Calculate days on MLS from result data""" list_date_str = result.get("list_date") - list_date = datetime.strptime(list_date_str.split("T")[0], "%Y-%m-%d") if list_date_str else None + list_date = None + if list_date_str: + try: + # Parse full datetime, then use date() for day calculation + list_date_str_clean = list_date_str.replace('Z', '+00:00') if list_date_str.endswith('Z') else list_date_str + list_date = datetime.fromisoformat(list_date_str_clean).replace(tzinfo=None) + except (ValueError, AttributeError): + # Fallback for date-only format + list_date = datetime.strptime(list_date_str.split("T")[0], "%Y-%m-%d") if "T" in list_date_str else None + last_sold_date_str = result.get("last_sold_date") - last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None + last_sold_date = None + if last_sold_date_str: + try: + last_sold_date_str_clean = last_sold_date_str.replace('Z', '+00:00') if last_sold_date_str.endswith('Z') else last_sold_date_str + last_sold_date = datetime.fromisoformat(last_sold_date_str_clean).replace(tzinfo=None) + except (ValueError, AttributeError): + # Fallback for date-only format + try: + last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") + except ValueError: + last_sold_date = None today = datetime.now() if list_date: diff --git a/homeharvest/core/scrapers/realtor/processors.py b/homeharvest/core/scrapers/realtor/processors.py index f172b13..fddfcf2 100644 --- a/homeharvest/core/scrapers/realtor/processors.py +++ b/homeharvest/core/scrapers/realtor/processors.py @@ -121,10 +121,10 @@ def process_property(result: dict, mls_only: bool = False, extra_property_data: list_price=result["list_price"], list_price_min=result["list_price_min"], list_price_max=result["list_price_max"], - list_date=(datetime.fromisoformat(result["list_date"].split("T")[0]) if result.get("list_date") else None), + list_date=(datetime.fromisoformat(result["list_date"].replace('Z', '+00:00') if result["list_date"].endswith('Z') else result["list_date"]) if result.get("list_date") else None), prc_sqft=result.get("price_per_sqft"), - last_sold_date=(datetime.fromisoformat(result["last_sold_date"]) if result.get("last_sold_date") else None), - pending_date=(datetime.fromisoformat(result["pending_date"].split("T")[0]) if result.get("pending_date") else None), + last_sold_date=(datetime.fromisoformat(result["last_sold_date"].replace('Z', '+00:00') if result["last_sold_date"].endswith('Z') else result["last_sold_date"]) if result.get("last_sold_date") else None), + pending_date=(datetime.fromisoformat(result["pending_date"].replace('Z', '+00:00') if result["pending_date"].endswith('Z') else result["pending_date"]) if result.get("pending_date") else None), new_construction=result["flags"].get("is_new_construction") is True, hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None), latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None), diff --git a/homeharvest/utils.py b/homeharvest/utils.py index 2a1c505..3492032 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -119,10 +119,10 @@ def process_result(result: Property) -> pd.DataFrame: prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None - # Convert datetime objects to strings for CSV + # Convert datetime objects to strings for CSV (preserve full datetime including time) for date_field in ["list_date", "pending_date", "last_sold_date"]: if prop_data.get(date_field): - prop_data[date_field] = prop_data[date_field].strftime("%Y-%m-%d") if hasattr(prop_data[date_field], 'strftime') else prop_data[date_field] + prop_data[date_field] = prop_data[date_field].strftime("%Y-%m-%d %H:%M:%S") if hasattr(prop_data[date_field], 'strftime') else prop_data[date_field] # Convert HttpUrl objects to strings for CSV if prop_data.get("property_url"): @@ -179,3 +179,65 @@ def validate_limit(limit: int) -> None: if limit is not None and (limit < 1 or limit > 10000): raise ValueError("Property limit must be between 1 and 10,000.") + + +def validate_datetime(datetime_str: str | None) -> None: + """Validate ISO 8601 datetime format.""" + if not datetime_str: + return + + try: + # Try parsing as ISO 8601 datetime + datetime.fromisoformat(datetime_str.replace('Z', '+00:00')) + except (ValueError, AttributeError): + raise InvalidDate( + f"Invalid datetime format: '{datetime_str}'. " + f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')." + ) + + +def validate_filters( + beds_min: int | None = None, + beds_max: int | None = None, + baths_min: float | None = None, + baths_max: float | None = None, + sqft_min: int | None = None, + sqft_max: int | None = None, + price_min: int | None = None, + price_max: int | None = None, + lot_sqft_min: int | None = None, + lot_sqft_max: int | None = None, + year_built_min: int | None = None, + year_built_max: int | None = None, +) -> None: + """Validate that min values are less than max values for range filters.""" + ranges = [ + ("beds", beds_min, beds_max), + ("baths", baths_min, baths_max), + ("sqft", sqft_min, sqft_max), + ("price", price_min, price_max), + ("lot_sqft", lot_sqft_min, lot_sqft_max), + ("year_built", year_built_min, year_built_max), + ] + + for name, min_val, max_val in ranges: + if min_val is not None and max_val is not None and min_val > max_val: + raise ValueError(f"{name}_min ({min_val}) cannot be greater than {name}_max ({max_val}).") + + +def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> None: + """Validate sort parameters.""" + valid_sort_fields = ["list_date", "sold_date", "list_price", "sqft", "beds", "baths"] + valid_directions = ["asc", "desc"] + + if sort_by and sort_by not in valid_sort_fields: + raise ValueError( + f"Invalid sort_by value: '{sort_by}'. " + f"Valid options: {', '.join(valid_sort_fields)}" + ) + + if sort_direction and sort_direction not in valid_directions: + raise ValueError( + f"Invalid sort_direction value: '{sort_direction}'. " + f"Valid options: {', '.join(valid_directions)}" + ) diff --git a/pyproject.toml b/pyproject.toml index 69e4d48..e9ca4c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.6.2" +version = "0.7.0" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index d7bd855..b4a4d13 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -446,4 +446,461 @@ def test_pending_date_filtering(): # We should get at least one of each type (when available) total_properties = pending_count + contingent_count - assert total_properties > 0, "Should find at least some pending or contingent properties" \ No newline at end of file + assert total_properties > 0, "Should find at least some pending or contingent properties" + + +def test_hour_based_filtering(): + """Test the new past_hours parameter for hour-level filtering""" + from datetime import datetime, timedelta + + # Test for sold properties with 24-hour filter + result_24h = scrape_property( + location="Phoenix, AZ", + listing_type="sold", + past_hours=24, + limit=50 + ) + + # Test for sold properties with 12-hour filter + result_12h = scrape_property( + location="Phoenix, AZ", + listing_type="sold", + past_hours=12, + limit=50 + ) + + assert result_24h is not None + assert result_12h is not None + + # 12-hour filter should return same or fewer results than 24-hour + if len(result_12h) > 0 and len(result_24h) > 0: + assert len(result_12h) <= len(result_24h), "12-hour results should be <= 24-hour results" + + # Verify timestamps are within the specified hour range for 24h filter + if len(result_24h) > 0: + cutoff_time = datetime.now() - timedelta(hours=24) + + # Check a few results + for idx in range(min(5, len(result_24h))): + sold_date_str = result_24h.iloc[idx]["last_sold_date"] + if pd.notna(sold_date_str): + try: + sold_date = datetime.strptime(str(sold_date_str), "%Y-%m-%d %H:%M:%S") + # Date should be within last 24 hours + assert sold_date >= cutoff_time, f"Property sold date {sold_date} should be within last 24 hours" + except (ValueError, TypeError): + pass # Skip if date parsing fails + + +def test_datetime_filtering(): + """Test datetime_from and datetime_to parameters with hour precision""" + from datetime import datetime, timedelta + + # Get a recent date range (e.g., yesterday) + yesterday = datetime.now() - timedelta(days=1) + date_str = yesterday.strftime("%Y-%m-%d") + + # Test filtering for business hours (9 AM to 5 PM) on a specific day + result = scrape_property( + location="Dallas, TX", + listing_type="for_sale", + datetime_from=f"{date_str}T09:00:00", + datetime_to=f"{date_str}T17:00:00", + limit=30 + ) + + assert result is not None + + # Test with only datetime_from + result_from_only = scrape_property( + location="Houston, TX", + listing_type="for_sale", + datetime_from=f"{date_str}T00:00:00", + limit=30 + ) + + assert result_from_only is not None + + # Test with only datetime_to + result_to_only = scrape_property( + location="Austin, TX", + listing_type="for_sale", + datetime_to=f"{date_str}T23:59:59", + limit=30 + ) + + assert result_to_only is not None + + +def test_full_datetime_preservation(): + """Verify that dates now include full timestamps (YYYY-MM-DD HH:MM:SS)""" + + # Test with pandas return type + result_pandas = scrape_property( + location="San Diego, CA", + listing_type="sold", + past_days=30, + limit=10 + ) + + assert result_pandas is not None and len(result_pandas) > 0 + + # Check that date fields contain time information + if len(result_pandas) > 0: + for idx in range(min(3, len(result_pandas))): + # Check last_sold_date + sold_date = result_pandas.iloc[idx]["last_sold_date"] + if pd.notna(sold_date): + sold_date_str = str(sold_date) + # Should contain time (HH:MM:SS), not just date + assert " " in sold_date_str or "T" in sold_date_str, \ + f"Date should include time component: {sold_date_str}" + + # Test with pydantic return type + result_pydantic = scrape_property( + location="Los Angeles, CA", + listing_type="for_sale", + past_days=7, + limit=10, + return_type="pydantic" + ) + + assert result_pydantic is not None and len(result_pydantic) > 0 + + # Verify Property objects have datetime objects with time info + for prop in result_pydantic[:3]: + if prop.list_date: + # Should be a datetime object, not just a date + assert hasattr(prop.list_date, 'hour'), "list_date should be a datetime with time" + + +def test_beds_filtering(): + """Test bedroom filtering with beds_min and beds_max""" + + result = scrape_property( + location="Atlanta, GA", + listing_type="for_sale", + beds_min=2, + beds_max=4, + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify all properties have 2-4 bedrooms + for idx in range(min(10, len(result))): + beds = result.iloc[idx]["beds"] + if pd.notna(beds): + assert 2 <= beds <= 4, f"Property should have 2-4 beds, got {beds}" + + # Test beds_min only + result_min = scrape_property( + location="Denver, CO", + listing_type="for_sale", + beds_min=3, + limit=30 + ) + + assert result_min is not None + + # Test beds_max only + result_max = scrape_property( + location="Seattle, WA", + listing_type="for_sale", + beds_max=2, + limit=30 + ) + + assert result_max is not None + + +def test_baths_filtering(): + """Test bathroom filtering with baths_min and baths_max""" + + result = scrape_property( + location="Miami, FL", + listing_type="for_sale", + baths_min=2.0, + baths_max=3.5, + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify bathrooms are within range + for idx in range(min(10, len(result))): + full_baths = result.iloc[idx]["full_baths"] + half_baths = result.iloc[idx]["half_baths"] + + if pd.notna(full_baths): + total_baths = float(full_baths) + (float(half_baths) * 0.5 if pd.notna(half_baths) else 0) + # Allow some tolerance as API might calculate differently + if total_baths > 0: + assert total_baths >= 1.5, f"Baths should be >= 2.0, got {total_baths}" + + +def test_sqft_filtering(): + """Test square footage filtering""" + + result = scrape_property( + location="Portland, OR", + listing_type="for_sale", + sqft_min=1000, + sqft_max=2500, + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify sqft is within range + for idx in range(min(10, len(result))): + sqft = result.iloc[idx]["sqft"] + if pd.notna(sqft) and sqft > 0: + assert 1000 <= sqft <= 2500, f"Sqft should be 1000-2500, got {sqft}" + + +def test_price_filtering(): + """Test price range filtering""" + + result = scrape_property( + location="Charlotte, NC", + listing_type="for_sale", + price_min=200000, + price_max=500000, + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify prices are within range + for idx in range(min(15, len(result))): + price = result.iloc[idx]["list_price"] + if pd.notna(price) and price > 0: + assert 200000 <= price <= 500000, f"Price should be $200k-$500k, got ${price}" + + +def test_lot_sqft_filtering(): + """Test lot size filtering""" + + result = scrape_property( + location="Scottsdale, AZ", + listing_type="for_sale", + lot_sqft_min=5000, + lot_sqft_max=15000, + limit=30 + ) + + assert result is not None + # Results might be fewer if lot_sqft data is sparse + + +def test_year_built_filtering(): + """Test year built filtering""" + + result = scrape_property( + location="Tampa, FL", + listing_type="for_sale", + year_built_min=2000, + year_built_max=2024, + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify year_built is within range + for idx in range(min(10, len(result))): + year = result.iloc[idx]["year_built"] + if pd.notna(year) and year > 0: + assert 2000 <= year <= 2024, f"Year should be 2000-2024, got {year}" + + +def test_combined_filters(): + """Test multiple filters working together""" + + result = scrape_property( + location="Nashville, TN", + listing_type="for_sale", + beds_min=3, + baths_min=2.0, + sqft_min=1500, + price_min=250000, + price_max=600000, + year_built_min=1990, + limit=30 + ) + + assert result is not None + + # If we get results, verify they meet ALL criteria + if len(result) > 0: + for idx in range(min(5, len(result))): + row = result.iloc[idx] + + # Check beds + if pd.notna(row["beds"]): + assert row["beds"] >= 3, f"Beds should be >= 3, got {row['beds']}" + + # Check sqft + if pd.notna(row["sqft"]) and row["sqft"] > 0: + assert row["sqft"] >= 1500, f"Sqft should be >= 1500, got {row['sqft']}" + + # Check price + if pd.notna(row["list_price"]) and row["list_price"] > 0: + assert 250000 <= row["list_price"] <= 600000, \ + f"Price should be $250k-$600k, got ${row['list_price']}" + + # Check year + if pd.notna(row["year_built"]) and row["year_built"] > 0: + assert row["year_built"] >= 1990, \ + f"Year should be >= 1990, got {row['year_built']}" + + +def test_sorting_by_price(): + """Test sorting by list_price - note API sorting may not be perfect""" + + # Sort ascending (cheapest first) + result_asc = scrape_property( + location="Orlando, FL", + listing_type="for_sale", + sort_by="list_price", + sort_direction="asc", + limit=20 + ) + + assert result_asc is not None and len(result_asc) > 0 + + # Sort descending (most expensive first) + result_desc = scrape_property( + location="San Antonio, TX", + listing_type="for_sale", + sort_by="list_price", + sort_direction="desc", + limit=20 + ) + + assert result_desc is not None and len(result_desc) > 0 + + # Note: Realtor API sorting may not be perfectly reliable for all search types + # The test ensures the sort parameters don't cause errors, actual sort order may vary + + +def test_sorting_by_date(): + """Test sorting by list_date - note API sorting may not be perfect""" + + result = scrape_property( + location="Columbus, OH", + listing_type="for_sale", + sort_by="list_date", + sort_direction="desc", # Newest first + limit=20 + ) + + assert result is not None and len(result) > 0 + + # Test ensures sort parameter doesn't cause errors + # Note: Realtor API sorting may not be perfectly reliable for all search types + + +def test_sorting_by_sqft(): + """Test sorting by square footage - note API sorting may not be perfect""" + + result = scrape_property( + location="Indianapolis, IN", + listing_type="for_sale", + sort_by="sqft", + sort_direction="desc", # Largest first + limit=20 + ) + + assert result is not None and len(result) > 0 + + # Test ensures sort parameter doesn't cause errors + # Note: Realtor API sorting may not be perfectly reliable for all search types + + +def test_filter_validation_errors(): + """Test that validation catches invalid parameters""" + import pytest + + # Test: beds_min > beds_max should raise ValueError + with pytest.raises(ValueError, match="beds_min.*cannot be greater than.*beds_max"): + scrape_property( + location="Boston, MA", + listing_type="for_sale", + beds_min=5, + beds_max=2, + limit=10 + ) + + # Test: invalid datetime format should raise exception + with pytest.raises(Exception): # InvalidDate + scrape_property( + location="Boston, MA", + listing_type="for_sale", + datetime_from="not-a-valid-datetime", + limit=10 + ) + + # Test: invalid sort_by value should raise ValueError + with pytest.raises(ValueError, match="Invalid sort_by"): + scrape_property( + location="Boston, MA", + listing_type="for_sale", + sort_by="invalid_field", + limit=10 + ) + + # Test: invalid sort_direction should raise ValueError + with pytest.raises(ValueError, match="Invalid sort_direction"): + scrape_property( + location="Boston, MA", + listing_type="for_sale", + sort_by="list_price", + sort_direction="invalid", + limit=10 + ) + + +def test_backward_compatibility(): + """Ensure old parameters still work as expected""" + + # Test past_days still works + result_past_days = scrape_property( + location="Las Vegas, NV", + listing_type="sold", + past_days=30, + limit=20 + ) + + assert result_past_days is not None and len(result_past_days) > 0 + + # Test date_from/date_to still work + result_date_range = scrape_property( + location="Memphis, TN", + listing_type="sold", + date_from="2024-01-01", + date_to="2024-03-31", + limit=20 + ) + + assert result_date_range is not None + + # Test property_type still works + result_property_type = scrape_property( + location="Louisville, KY", + listing_type="for_sale", + property_type=["single_family"], + limit=20 + ) + + assert result_property_type is not None and len(result_property_type) > 0 + + # Test foreclosure still works + result_foreclosure = scrape_property( + location="Detroit, MI", + listing_type="for_sale", + foreclosure=True, + limit=15 + ) + + assert result_foreclosure is not None \ No newline at end of file