mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9b05ebd9d | ||
|
|
e9bfd66986 | ||
|
|
2fdebf1f20 | ||
|
|
23a8fd6a77 | ||
|
|
75c245cde7 |
@@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
|
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
|
||||||
|
|
||||||
|
- 🚀 [HomeHarvest MCP](https://smithery.ai/server/@ZacharyHampton/homeharvest-mcp) - Easily get property data in your agent.
|
||||||
|
- 🏠 [Zillow API](https://rapidapi.com/zachary-l1izVlvs2/api/zillow-com9) - Get Zillow data with ease.
|
||||||
|
|
||||||
## HomeHarvest Features
|
## HomeHarvest Features
|
||||||
|
|
||||||
- **Source**: Fetches properties directly from **Realtor.com**.
|
- **Source**: Fetches properties directly from **Realtor.com**.
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ def scrape_property(
|
|||||||
:param mls_only: If set, fetches only listings with MLS IDs.
|
:param mls_only: If set, fetches only listings with MLS IDs.
|
||||||
:param proxy: Proxy to use for scraping
|
:param proxy: Proxy to use for scraping
|
||||||
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
|
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
|
||||||
|
- PENDING: Filters by pending_date. Contingent properties without pending_date are included.
|
||||||
|
- SOLD: Filters by sold_date (when property was sold)
|
||||||
|
- FOR_SALE/FOR_RENT: Filters by list_date (when property was listed)
|
||||||
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
|
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
|
||||||
:param foreclosure: If set, fetches only foreclosure listings.
|
:param foreclosure: If set, fetches only foreclosure listings.
|
||||||
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
||||||
|
|||||||
@@ -137,6 +137,10 @@ class RealtorScraper(Scraper):
|
|||||||
date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
|
date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
|
||||||
elif self.last_x_days:
|
elif self.last_x_days:
|
||||||
date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
|
date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
|
||||||
|
elif self.listing_type == ListingType.PENDING:
|
||||||
|
# Skip server-side date filtering for PENDING as both pending_date and contract_date
|
||||||
|
# filters are broken in the API. Client-side filtering will be applied later.
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
if self.date_from and self.date_to:
|
if self.date_from and self.date_to:
|
||||||
date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
|
date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
|
||||||
@@ -378,8 +382,126 @@ class RealtorScraper(Scraper):
|
|||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
homes.extend(future.result()["properties"])
|
homes.extend(future.result()["properties"])
|
||||||
|
|
||||||
|
# Apply client-side date filtering for PENDING properties
|
||||||
|
# (server-side filters are broken in the API)
|
||||||
|
if self.listing_type == ListingType.PENDING and (self.last_x_days or self.date_from):
|
||||||
|
homes = self._apply_pending_date_filter(homes)
|
||||||
|
|
||||||
return homes
|
return homes
|
||||||
|
|
||||||
|
def _apply_pending_date_filter(self, homes):
    """Apply client-side date filtering for PENDING properties.

    The window comes from ``self._get_date_range()``. Each home is dated via
    ``self._extract_property_date_for_filtering()``.

    :param homes: scraped properties to filter.
    :return: ``homes`` unchanged when it is empty or no date window is
        available; otherwise a new list holding the homes whose date falls
        inside the window, plus contingent homes that have no usable date.
    """
    if not homes:
        return homes

    date_range = self._get_date_range()
    if not date_range:
        return homes

    filtered_homes = []
    for home in homes:
        property_date = self._extract_property_date_for_filtering(home)

        if property_date is None:
            # Nothing to compare against: contingent listings are kept,
            # every other undated listing is dropped.
            if self._is_contingent(home):
                filtered_homes.append(home)
        elif self._is_date_in_range(property_date, date_range):
            filtered_homes.append(home)

    return filtered_homes
|
||||||
|
|
||||||
|
def _get_pending_date(self, home):
    """Return the ``pending_date`` of *home*, or ``None`` when it has none.

    Dicts are read by key; any other object is read by attribute.
    """
    if not isinstance(home, dict):
        return getattr(home, 'pending_date', None)
    return home.get('pending_date')
|
||||||
|
|
||||||
|
|
||||||
|
def _is_contingent(self, home):
    """Check if a property is contingent.

    :param home: a dict or an object exposing an optional ``flags`` entry.
    :return: ``True`` when ``flags`` carries a truthy ``is_contingent``;
        ``False`` when the flag is falsy or ``flags`` is missing or empty.
    """
    if isinstance(home, dict):
        flags = home.get('flags')
    else:
        flags = getattr(home, 'flags', None)

    # `flags` may be present but null, so a `.get(..., {})` default is not enough.
    if not flags:
        return False

    if isinstance(flags, dict):
        return bool(flags.get('is_contingent', False))
    return bool(getattr(flags, 'is_contingent', False))
|
||||||
|
|
||||||
|
def _get_date_range(self):
    """Get the date range for filtering based on instance parameters.

    Reads ``self.last_x_days``, ``self.date_from`` and ``self.date_to``.

    :return: ``{'type': 'since', 'date': datetime}`` when ``last_x_days`` is
        set; ``{'type': 'range', 'from_date': datetime, 'to_date': datetime}``
        when both bounds are set and parse as ISO dates; otherwise ``None``.
        All returned datetimes are timezone-naive.
    """
    from datetime import datetime, time, timedelta

    if self.last_x_days:
        cutoff_date = datetime.now() - timedelta(days=self.last_x_days)
        return {'type': 'since', 'date': cutoff_date}

    if not (self.date_from and self.date_to):
        return None

    try:
        from_date = datetime.fromisoformat(self.date_from)
        to_date = datetime.fromisoformat(self.date_to)
    except ValueError:
        return None

    # Property dates are compared as naive datetimes, so drop any offset here.
    from_date = from_date.replace(tzinfo=None)
    to_date = to_date.replace(tzinfo=None)

    # A date-only upper bound parses to midnight, which would exclude
    # everything later that day; extend it to the end of the day so the
    # range is inclusive of `date_to`.
    if ':' not in self.date_to:
        to_date = datetime.combine(to_date.date(), time.max)

    return {'type': 'range', 'from_date': from_date, 'to_date': to_date}
|
||||||
|
|
||||||
|
def _extract_property_date_for_filtering(self, home):
    """Look up the pending date of *home* and parse it for filtering.

    :return: whatever ``self._parse_date_value`` yields for the property's
        pending date, or ``None`` when the property has no pending date.
    """
    raw_date = self._get_pending_date(home)
    return self._parse_date_value(raw_date) if raw_date else None
|
||||||
|
|
||||||
|
def _parse_date_value(self, date_value):
    """Parse a date value (string or datetime) into a timezone-naive datetime object.

    Timezone information is dropped without shifting the clock time.

    :param date_value: a ``datetime``, an ISO-format string (a trailing
        ``Z`` is accepted), or a ``'YYYY-MM-DD HH:MM:SS'`` string.
    :return: a naive ``datetime``, or ``None`` when the value is of another
        type or cannot be parsed.
    """
    from datetime import datetime

    if isinstance(date_value, datetime):
        return date_value.replace(tzinfo=None)

    if not isinstance(date_value, str):
        return None

    # datetime.fromisoformat() only accepts a trailing 'Z' from Python 3.11
    # on; rewrite it as an explicit UTC offset so older versions parse it too.
    if date_value.endswith('Z'):
        date_value = date_value[:-1] + '+00:00'

    try:
        parsed_date = datetime.fromisoformat(date_value)
    except ValueError:
        try:
            # Fall back to the simple datetime format: '2025-08-29 00:00:00'
            parsed_date = datetime.strptime(date_value, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            return None

    return parsed_date.replace(tzinfo=None)
|
||||||
|
|
||||||
|
def _is_date_in_range(self, date_obj, date_range):
    """Tell whether *date_obj* satisfies *date_range*.

    A ``'since'`` range accepts dates on or after ``date_range['date']``;
    a ``'range'`` range accepts dates between ``from_date`` and ``to_date``
    inclusive. Any other range type accepts nothing.
    """
    kind = date_range['type']
    if kind == 'range':
        lower, upper = date_range['from_date'], date_range['to_date']
        return lower <= date_obj <= upper
    if kind == 'since':
        return date_obj >= date_range['date']
    return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
|
|||||||
@@ -202,6 +202,11 @@ fragment HomeData on Home {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
|
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
|
||||||
|
property_history {
|
||||||
|
date
|
||||||
|
event_name
|
||||||
|
price
|
||||||
|
}
|
||||||
monthly_fees {
|
monthly_fees {
|
||||||
description
|
description
|
||||||
display_amount
|
display_amount
|
||||||
|
|||||||
@@ -1,16 +1,16 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.5.2"
|
version = "0.6.2"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
homeharvest = "homeharvest.cli:main"
|
homeharvest = "homeharvest.cli:main"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = ">=3.9,<3.13"
|
python = ">=3.9"
|
||||||
requests = "^2.32.4"
|
requests = "^2.32.4"
|
||||||
pandas = "^2.3.1"
|
pandas = "^2.3.1"
|
||||||
pydantic = "^2.11.7"
|
pydantic = "^2.11.7"
|
||||||
|
|||||||
@@ -373,3 +373,77 @@ def test_return_type_consistency():
|
|||||||
assert len(pandas_ids) > 0, f"pandas should return properties for {search_type}"
|
assert len(pandas_ids) > 0, f"pandas should return properties for {search_type}"
|
||||||
assert len(pydantic_ids) > 0, f"pydantic should return properties for {search_type}"
|
assert len(pydantic_ids) > 0, f"pydantic should return properties for {search_type}"
|
||||||
assert len(raw_ids) > 0, f"raw should return properties for {search_type}"
|
assert len(raw_ids) > 0, f"raw should return properties for {search_type}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pending_date_filtering():
    """Test that pending properties are properly filtered by pending_date using client-side filtering."""

    def fetch_pending(**kwargs):
        # Every scenario queries the same location and listing type.
        return scrape_property(location="Dallas, TX", listing_type="pending", **kwargs)

    # Test 1: Verify that date filtering works with different time windows
    result_no_filter = fetch_pending(limit=20)
    result_30_days = fetch_pending(past_days=30, limit=20)
    result_10_days = fetch_pending(past_days=10, limit=20)

    # Each call must hand back a result object; an empty one is acceptable
    # because the live data may contain no matching listings.
    assert result_no_filter is not None
    assert result_30_days is not None
    assert result_10_days is not None

    # Filtering should work: a narrower window can only return the same or fewer results
    assert len(result_30_days) <= len(result_no_filter), "30-day filter should return <= unfiltered results"
    assert len(result_10_days) <= len(result_30_days), "10-day filter should return <= 30-day results"

    if len(result_no_filter) > 0:
        # Test 2: Verify that date range filtering works
        result_date_range = fetch_pending(
            date_from="2025-08-01",
            date_to="2025-12-31",
            limit=20,
        )
        assert result_date_range is not None

        # Test 3: Verify that pending/contingent properties are included.
        # Raw data is used because it exposes the per-property flags.
        raw_result = fetch_pending(return_type="raw", limit=15)

        if raw_result:
            pending_count = 0
            contingent_count = 0

            for prop in raw_result:
                # `flags` may be present but null in the raw payload
                flags = prop.get('flags') or {}
                if flags.get('is_pending'):
                    pending_count += 1
                if flags.get('is_contingent'):
                    contingent_count += 1

            # At least one returned property must be pending or contingent
            total_properties = pending_count + contingent_count
            assert total_properties > 0, "Should find at least some pending or contingent properties"
|
||||||
Reference in New Issue
Block a user