mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 03:54:29 -08:00
Fix exclude_pending and mls_only filters not working with raw return type
When return_type="raw" was specified, the exclude_pending and mls_only parameters were ignored because these filters only existed in process_property(), which is bypassed for raw data returns.

Changes:
- Added _apply_raw_data_filters() method to handle client-side filtering for raw data
- Applied the filter in the search() method after sorting but before returning
- Fixed exclude_pending to check flags.is_pending and flags.is_contingent
- Fixed mls_only to check source.id (not mls.id, which doesn't exist in raw data)
- Added comprehensive tests for both filters with raw data

Fixes #140

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -522,6 +522,11 @@ class RealtorScraper(Scraper):
|
|||||||
if self.sort_by:
|
if self.sort_by:
|
||||||
homes = self._apply_sort(homes)
|
homes = self._apply_sort(homes)
|
||||||
|
|
||||||
|
# Apply raw data filters (exclude_pending and mls_only) for raw return type
|
||||||
|
# These filters are normally applied in process_property() but are bypassed for raw data
|
||||||
|
if self.return_type == ReturnType.raw:
|
||||||
|
homes = self._apply_raw_data_filters(homes)
|
||||||
|
|
||||||
return homes
|
return homes
|
||||||
|
|
||||||
def _apply_hour_based_date_filter(self, homes):
|
def _apply_hour_based_date_filter(self, homes):
|
||||||
@@ -800,6 +805,47 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
return sorted_homes
|
return sorted_homes
|
||||||
|
|
||||||
|
def _apply_raw_data_filters(self, homes):
|
||||||
|
"""Apply exclude_pending and mls_only filters for raw data returns.
|
||||||
|
|
||||||
|
These filters are normally applied in process_property(), but that function
|
||||||
|
is bypassed when return_type="raw", so we need to apply them here instead.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
homes: List of properties (either dicts or Property objects)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Filtered list of properties
|
||||||
|
"""
|
||||||
|
if not homes:
|
||||||
|
return homes
|
||||||
|
|
||||||
|
# Only filter raw data (dict objects)
|
||||||
|
# Property objects have already been filtered in process_property()
|
||||||
|
if homes and not isinstance(homes[0], dict):
|
||||||
|
return homes
|
||||||
|
|
||||||
|
filtered_homes = []
|
||||||
|
|
||||||
|
for home in homes:
|
||||||
|
# Apply exclude_pending filter
|
||||||
|
if self.exclude_pending and self.listing_type != ListingType.PENDING:
|
||||||
|
flags = home.get('flags', {})
|
||||||
|
is_pending = flags.get('is_pending', False)
|
||||||
|
is_contingent = flags.get('is_contingent', False)
|
||||||
|
|
||||||
|
if is_pending or is_contingent:
|
||||||
|
continue # Skip this property
|
||||||
|
|
||||||
|
# Apply mls_only filter
|
||||||
|
if self.mls_only:
|
||||||
|
source = home.get('source', {})
|
||||||
|
if not source or not source.get('id'):
|
||||||
|
continue # Skip this property
|
||||||
|
|
||||||
|
filtered_homes.append(home)
|
||||||
|
|
||||||
|
return filtered_homes
|
||||||
|
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.7.2"
|
version = "0.7.3"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
|
|||||||
@@ -1270,3 +1270,83 @@ def test_last_status_change_date_hour_filtering():
|
|||||||
f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}"
|
f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}"
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass # Skip if parsing fails
|
pass # Skip if parsing fails
|
||||||
|
|
||||||
|
|
||||||
|
def test_exclude_pending_with_raw_data():
    """Check that exclude_pending=True is honoured when return_type='raw'.

    Requests raw for-sale results with exclude_pending enabled and asserts
    that no returned record is flagged as pending or contingent.
    """
    search_kwargs = dict(
        location="Phoenix, AZ",
        listing_type="for_sale",
        exclude_pending=True,
        return_type="raw",
        limit=50,
    )
    result = scrape_property(**search_kwargs)

    # The search is expected to return at least one record.
    assert result is not None and len(result) > 0

    for prop in result:
        status_flags = prop.get('flags', {})
        pending = status_flags.get('is_pending', False)
        contingent = status_flags.get('is_contingent', False)
        pid = prop.get('property_id')

        assert not pending, f"Property {pid} should not be pending when exclude_pending=True"
        assert not contingent, f"Property {pid} should not be contingent when exclude_pending=True"
|
||||||
|
|
||||||
|
|
||||||
|
def test_mls_only_with_raw_data():
    """Check that mls_only=True is honoured when return_type='raw'.

    Requests raw for-sale results with mls_only enabled and asserts that
    every returned record carries a non-empty MLS ID in source.id.
    """
    search_kwargs = dict(
        location="Dallas, TX",
        listing_type="for_sale",
        mls_only=True,
        return_type="raw",
        limit=50,
    )
    result = scrape_property(**search_kwargs)

    # The search is expected to return at least one record.
    assert result is not None and len(result) > 0

    for prop in result:
        listing_source = prop.get('source', {})
        # A missing or empty source counts as "no MLS ID".
        mls_id = None if not listing_source else listing_source.get('id')

        assert mls_id is not None and mls_id != "", (
            f"Property {prop.get('property_id')} should have an MLS ID (source.id) when mls_only=True, got: {mls_id}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_combined_filters_with_raw_data():
    """Check that exclude_pending and mls_only work together with return_type='raw'.

    Requests raw for-sale results with both filters enabled and asserts, for
    every returned record, that it is neither pending nor contingent and
    that it carries a non-empty MLS ID in source.id.
    """
    search_kwargs = dict(
        location="Austin, TX",
        listing_type="for_sale",
        exclude_pending=True,
        mls_only=True,
        return_type="raw",
        limit=30,
    )
    result = scrape_property(**search_kwargs)

    # The search is expected to return at least one record.
    assert result is not None and len(result) > 0

    for prop in result:
        pid = prop.get('property_id')

        # exclude_pending: neither status flag may be set.
        status_flags = prop.get('flags', {})
        pending = status_flags.get('is_pending', False)
        contingent = status_flags.get('is_contingent', False)

        assert not pending, f"Property {pid} should not be pending"
        assert not contingent, f"Property {pid} should not be contingent"

        # mls_only: source.id must be present and non-empty.
        listing_source = prop.get('source', {})
        mls_id = None if not listing_source else listing_source.get('id')

        assert mls_id is not None and mls_id != "", (
            f"Property {pid} should have an MLS ID (source.id)"
        )
|
||||||
Reference in New Issue
Block a user