diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index 6486931..f70aff1 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -522,6 +522,11 @@ class RealtorScraper(Scraper):
         if self.sort_by:
             homes = self._apply_sort(homes)
 
+        # Apply raw data filters (exclude_pending and mls_only) for raw return type
+        # These filters are normally applied in process_property() but are bypassed for raw data
+        if self.return_type == ReturnType.raw:
+            homes = self._apply_raw_data_filters(homes)
+
         return homes
 
     def _apply_hour_based_date_filter(self, homes):
@@ -800,6 +805,47 @@ class RealtorScraper(Scraper):
 
         return sorted_homes
 
+    def _apply_raw_data_filters(self, homes):
+        """Apply the exclude_pending and mls_only filters to raw (dict) results.
+
+        process_property() normally enforces these filters, but it is bypassed
+        when return_type="raw", so the same rules are applied here instead.
+
+        Args:
+            homes: List of properties (either dicts or Property objects)
+
+        Returns:
+            Filtered list; returned unchanged when empty or not made of dicts
+        """
+        if not homes:
+            return homes
+
+        # Only filter raw data (dict objects)
+        # Property objects have already been filtered in process_property()
+        if not isinstance(homes[0], dict):
+            return homes
+
+        filtered_homes = []
+
+        for home in homes:
+            # exclude_pending: "flags" may be present but null, so fall back to {}
+            if self.exclude_pending and self.listing_type != ListingType.PENDING:
+                flags = home.get('flags') or {}
+                is_pending = flags.get('is_pending', False)
+                is_contingent = flags.get('is_contingent', False)
+
+                if is_pending or is_contingent:
+                    continue  # Skip this property
+
+            # mls_only: keep only listings that carry an MLS id in source.id
+            if self.mls_only:
+                source = home.get('source') or {}
+                if not source.get('id'):
+                    continue  # Skip this property
+
+            filtered_homes.append(home)
+
+        return filtered_homes
 
 
     @retry(
diff --git a/pyproject.toml b/pyproject.toml
index 0c276da..862b36c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.7.2"
+version = "0.7.3"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"
diff --git a/tests/test_realtor.py b/tests/test_realtor.py
index 8cb6bf1..c5fd493 100644
--- a/tests/test_realtor.py
+++ b/tests/test_realtor.py
@@ -1269,4 +1269,84 @@ def test_last_status_change_date_hour_filtering():
                     assert pending_date >= cutoff_time, \
                         f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}"
                 except (ValueError, TypeError):
-                    pass  # Skip if parsing fails
\ No newline at end of file
+                    pass  # Skip if parsing fails
+
+
+def test_exclude_pending_with_raw_data():
+    """Test that exclude_pending parameter works correctly with return_type='raw'"""
+
+    # Query for sale properties with exclude_pending=True and raw data
+    result = scrape_property(
+        location="Phoenix, AZ",
+        listing_type="for_sale",
+        exclude_pending=True,
+        return_type="raw",
+        limit=50
+    )
+
+    assert result is not None and len(result) > 0
+
+    # Verify that no pending or contingent properties are in the results
+    for prop in result:
+        flags = prop.get('flags') or {}
+        is_pending = flags.get('is_pending', False)
+        is_contingent = flags.get('is_contingent', False)
+
+        assert not is_pending, f"Property {prop.get('property_id')} should not be pending when exclude_pending=True"
+        assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent when exclude_pending=True"
+
+
+def test_mls_only_with_raw_data():
+    """Test that mls_only parameter works correctly with return_type='raw'"""
+
+    # Query with mls_only=True and raw data
+    result = scrape_property(
+        location="Dallas, TX",
+        listing_type="for_sale",
+        mls_only=True,
+        return_type="raw",
+        limit=50
+    )
+
+    assert result is not None and len(result) > 0
+
+    # Verify that all properties have MLS IDs (stored in source.id)
+    for prop in result:
+        source = prop.get('source', {})
+        mls_id = source.get('id') if source else None
+
+        assert mls_id is not None and mls_id != "", \
+            f"Property {prop.get('property_id')} should have an MLS ID (source.id) when mls_only=True, got: {mls_id}"
+
+
+def test_combined_filters_with_raw_data():
+    """Test that both exclude_pending and mls_only work together with return_type='raw'"""
+
+    # Query with both filters enabled and raw data
+    result = scrape_property(
+        location="Austin, TX",
+        listing_type="for_sale",
+        exclude_pending=True,
+        mls_only=True,
+        return_type="raw",
+        limit=30
+    )
+
+    assert result is not None and len(result) > 0
+
+    # Verify both filters are applied
+    for prop in result:
+        # Check exclude_pending filter
+        flags = prop.get('flags') or {}
+        is_pending = flags.get('is_pending', False)
+        is_contingent = flags.get('is_contingent', False)
+
+        assert not is_pending, f"Property {prop.get('property_id')} should not be pending"
+        assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent"
+
+        # Check mls_only filter
+        source = prop.get('source', {})
+        mls_id = source.get('id') if source else None
+
+        assert mls_id is not None and mls_id != "", \
+            f"Property {prop.get('property_id')} should have an MLS ID (source.id)"