From 4e6e144617a2819f38035f7259688c2f9129f2d2 Mon Sep 17 00:00:00 2001 From: Zachary Hampton Date: Mon, 10 Nov 2025 11:21:28 -0800 Subject: [PATCH] Fix exclude_pending and mls_only filters not working with raw return type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When return_type="raw" was specified, the exclude_pending and mls_only parameters were ignored because these filters only existed in process_property(), which is bypassed for raw data returns. Changes: - Added _apply_raw_data_filters() method to handle client-side filtering for raw data - Applied the filter in search() method after sorting but before returning - Fixed exclude_pending to check flags.is_pending and flags.is_contingent - Fixed mls_only to check source.id (not mls.id which doesn't exist in raw data) - Added comprehensive tests for both filters with raw data Fixes #140 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- homeharvest/core/scrapers/realtor/__init__.py | 46 +++++++++++ pyproject.toml | 2 +- tests/test_realtor.py | 82 ++++++++++++++++++- 3 files changed, 128 insertions(+), 2 deletions(-) diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 6486931..f70aff1 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -522,6 +522,11 @@ class RealtorScraper(Scraper): if self.sort_by: homes = self._apply_sort(homes) + # Apply raw data filters (exclude_pending and mls_only) for raw return type + # These filters are normally applied in process_property() but are bypassed for raw data + if self.return_type == ReturnType.raw: + homes = self._apply_raw_data_filters(homes) + return homes def _apply_hour_based_date_filter(self, homes): @@ -800,6 +805,47 @@ class RealtorScraper(Scraper): return sorted_homes + def _apply_raw_data_filters(self, homes): + """Apply exclude_pending and mls_only filters for raw data returns. + + These filters are normally applied in process_property(), but that function + is bypassed when return_type="raw", so we need to apply them here instead. + + Args: + homes: List of properties (either dicts or Property objects) + + Returns: + Filtered list of properties + """ + if not homes: + return homes + + # Only filter raw data (dict objects) + # Property objects have already been filtered in process_property() + if homes and not isinstance(homes[0], dict): + return homes + + filtered_homes = [] + + for home in homes: + # Apply exclude_pending filter + if self.exclude_pending and self.listing_type != ListingType.PENDING: + flags = home.get('flags', {}) + is_pending = flags.get('is_pending', False) + is_contingent = flags.get('is_contingent', False) + + if is_pending or is_contingent: + continue # Skip this property + + # Apply mls_only filter + if self.mls_only: + source = home.get('source', {}) + if not source or not source.get('id'): + continue # Skip this property + + filtered_homes.append(home) + + return filtered_homes @retry( diff --git a/pyproject.toml b/pyproject.toml index 0c276da..862b36c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.7.2" +version = "0.7.3" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 8cb6bf1..c5fd493 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -1269,4 +1269,84 @@ def test_last_status_change_date_hour_filtering(): assert pending_date >= cutoff_time, \ f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}" except (ValueError, TypeError): - pass # Skip if parsing fails \ No newline at end of file + pass # Skip if parsing fails + + +def test_exclude_pending_with_raw_data(): + """Test that exclude_pending parameter works correctly with return_type='raw'""" + + # Query for sale properties with exclude_pending=True and raw data + result = scrape_property( + location="Phoenix, AZ", + listing_type="for_sale", + exclude_pending=True, + return_type="raw", + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify that no pending or contingent properties are in the results + for prop in result: + flags = prop.get('flags', {}) + is_pending = flags.get('is_pending', False) + is_contingent = flags.get('is_contingent', False) + + assert not is_pending, f"Property {prop.get('property_id')} should not be pending when exclude_pending=True" + assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent when exclude_pending=True" + + +def test_mls_only_with_raw_data(): + """Test that mls_only parameter works correctly with return_type='raw'""" + + # Query with mls_only=True and raw data + result = scrape_property( + location="Dallas, TX", + listing_type="for_sale", + mls_only=True, + return_type="raw", + limit=50 + ) + + assert result is not None and len(result) > 0 + + # Verify that all properties have MLS IDs (stored in source.id) + for prop in result: + source = prop.get('source', {}) + mls_id = source.get('id') if source else None + + assert mls_id is not None and mls_id != "", \ + f"Property {prop.get('property_id')} should have an MLS ID (source.id) when mls_only=True, got: {mls_id}" + + +def test_combined_filters_with_raw_data(): + """Test that both exclude_pending and mls_only work together with return_type='raw'""" + + # Query with both filters enabled and raw data + result = scrape_property( + location="Austin, TX", + listing_type="for_sale", + exclude_pending=True, + mls_only=True, + return_type="raw", + limit=30 + ) + + assert result is not None and len(result) > 0 + + # Verify both filters are applied + for prop in result: + # Check exclude_pending filter + flags = prop.get('flags', {}) + is_pending = flags.get('is_pending', False) + is_contingent = flags.get('is_contingent', False) + + assert not is_pending, f"Property {prop.get('property_id')} should not be pending" + assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent" + + # Check mls_only filter + source = prop.get('source', {}) + mls_id = source.get('id') if source else None + + assert mls_id is not None and mls_id != "", \ + f"Property {prop.get('property_id')} should have an MLS ID (source.id)" \ No newline at end of file