mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-04 19:44:29 -08:00
This major enhancement addresses user needs for more precise filtering and introduces powerful new capabilities for property searches:

Key Features:
- Hour-based date filtering (past_hours, datetime_from/to with ISO 8601 support)
- Server-side property filters (beds, baths, sqft, price, lot_sqft, year_built)
- Sorting support (list_date, sold_date, list_price, sqft, beds, baths)
- Full timestamp preservation (YYYY-MM-DD HH:MM:SS instead of date-only)
- Comprehensive validation with helpful error messages

Technical Changes:
- Preserve full datetime precision in processors.py and parsers.py
- Implement client-side hour-based post-filtering for all listing types
- Add server-side GraphQL filters for property characteristics
- Generalize filtering to work across SOLD, PENDING, FOR_SALE, FOR_RENT
- Add 15 comprehensive tests covering all new features
- Maintain full backward compatibility with existing parameters

Fixes #113 (sorting support)

Version bump to 0.7.0 reflects significant new functionality while maintaining backward compatibility.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
244 lines
8.5 KiB
Python
244 lines
8.5 KiB
Python
from __future__ import annotations
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from .core.scrapers.models import Property, ListingType, Advertisers
|
|
from .exceptions import InvalidListingType, InvalidDate
|
|
|
|
# Column names, in output order, used by process_result() both to seed every
# row with None and to arrange the columns of the resulting DataFrame.
# The grouping comments are for readers only; the sequence is what matters.
ordered_properties = [
    # Identifiers and listing status
    "property_url", "property_id", "listing_id", "permalink",
    "mls", "mls_id", "status", "mls_status",
    # Free-text description and property style
    "text", "style",
    # Address
    "formatted_address", "full_street_line", "street", "unit",
    "city", "state", "zip_code",
    # Size and age
    "beds", "full_baths", "half_baths", "sqft", "year_built",
    # Listing timeline and prices
    "days_on_mls", "list_price", "list_price_min", "list_price_max",
    "list_date", "pending_date", "sold_price",
    "last_sold_date", "last_sold_price",
    # Valuation and tax
    "assessed_value", "estimated_value", "tax", "tax_history",
    # Construction and lot
    "new_construction", "lot_sqft", "price_per_sqft",
    # Location
    "latitude", "longitude", "neighborhoods", "county", "fips_code",
    # Building details
    "stories", "hoa_fee", "parking_garage",
    # Agent
    "agent_id", "agent_name", "agent_email", "agent_phones",
    "agent_mls_set", "agent_nrds_id",
    # Broker
    "broker_id", "broker_name",
    # Builder
    "builder_id", "builder_name",
    # Office
    "office_id", "office_mls_set", "office_name",
    "office_email", "office_phones",
    # Schools and photos
    "nearby_schools", "primary_photo", "alt_photos",
]
|
|
|
|
|
|
def process_result(result: Property) -> pd.DataFrame:
    """Flatten one scraped ``Property`` into a single-row DataFrame.

    The model is dumped to a dict, nested ``address`` / ``advertisers`` /
    ``description`` data is lifted into flat columns, values that do not
    serialise cleanly to CSV (datetimes, URLs, lists) are turned into
    strings, and the columns are arranged according to ``ordered_properties``.

    Args:
        result: The property to convert. It must provide ``model_dump()`` and
            a ``description`` attribute.

    Returns:
        A one-row DataFrame whose columns are exactly ``ordered_properties``,
        in that order. Keys of the dumped model that are not listed there
        are dropped.
    """
    # Seed every output column so that columns with no data still exist.
    prop_data = {prop: None for prop in ordered_properties}
    prop_data.update(result.model_dump())

    # Output column -> key inside the nested address dict.
    address = prop_data.get("address")
    if address:
        for column, key in (
            ("full_street_line", "full_line"),
            ("street", "street"),
            ("unit", "unit"),
            ("city", "city"),
            ("state", "state"),
            ("zip_code", "zip"),
            ("formatted_address", "formatted_address"),
        ):
            prop_data[column] = address.get(key)

    # Advertiser role -> {output column: key inside that role's dict}.
    advertisers = prop_data.get("advertisers")
    if advertisers:
        advertiser_columns = {
            "agent": {
                "agent_id": "uuid",
                "agent_name": "name",
                "agent_email": "email",
                "agent_phones": "phones",
                "agent_mls_set": "mls_set",
                "agent_nrds_id": "nrds_id",
            },
            "broker": {
                "broker_id": "uuid",
                "broker_name": "name",
            },
            "builder": {
                "builder_id": "uuid",
                "builder_name": "name",
            },
            "office": {
                "office_id": "uuid",
                "office_name": "name",
                "office_email": "email",
                "office_phones": "phones",
                "office_mls_set": "mls_set",
            },
        }
        for role, columns in advertiser_columns.items():
            role_data = advertisers.get(role)
            if not role_data:
                continue
            for column, key in columns.items():
                prop_data[column] = role_data.get(key)

    prop_data["price_per_sqft"] = prop_data.get("prc_sqft")

    # Drop empty entries and de-duplicate while keeping first-seen order, so
    # the joined string is identical from run to run (joining a set is not).
    # If nothing is left the cell is None rather than an empty string.
    schools = prop_data.get("nearby_schools")
    unique_schools = list(dict.fromkeys(s for s in schools if s)) if schools else []
    prop_data["nearby_schools"] = ", ".join(unique_schools) if unique_schools else None

    # Convert datetime objects to strings for CSV (preserve full datetime including time).
    # Values without strftime (e.g. already strings) are left untouched.
    for date_field in ("list_date", "pending_date", "last_sold_date"):
        value = prop_data.get(date_field)
        if value and hasattr(value, "strftime"):
            prop_data[date_field] = value.strftime("%Y-%m-%d %H:%M:%S")

    # Convert HttpUrl objects to strings for CSV
    if prop_data.get("property_url"):
        prop_data["property_url"] = str(prop_data["property_url"])

    description = result.description
    if description:
        prop_data["primary_photo"] = str(description.primary_photo) if description.primary_photo else None
        prop_data["alt_photos"] = (
            ", ".join(str(url) for url in description.alt_photos) if description.alt_photos else None
        )

        # style is either already a string or an object exposing .value.
        style = description.style
        if not isinstance(style, str):
            style = style.value if style else None
        prop_data["style"] = style

        prop_data["beds"] = description.beds
        prop_data["full_baths"] = description.baths_full
        prop_data["half_baths"] = description.baths_half
        prop_data["sqft"] = description.sqft
        prop_data["lot_sqft"] = description.lot_sqft
        prop_data["sold_price"] = description.sold_price
        prop_data["year_built"] = description.year_built
        prop_data["parking_garage"] = description.garage
        prop_data["stories"] = description.stories
        prop_data["text"] = description.text

    # reindex() both drops the leftover nested keys and fixes the column
    # order, so no further column selection is needed.
    return pd.DataFrame([prop_data]).reindex(columns=ordered_properties)
|
|
|
|
|
|
def validate_input(listing_type: str) -> None:
    """Reject listing types that are not members of ``ListingType``.

    The comparison is case-insensitive: the value is upper-cased before it
    is looked up among the enum member names.

    Raises:
        InvalidListingType: If no member of ``ListingType`` has that name.
    """
    member_name = listing_type.upper()
    if member_name in ListingType.__members__:
        return

    raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
|
|
|
|
|
|
def validate_dates(date_from: str | None, date_to: str | None) -> None:
    """Check that ``date_from`` and ``date_to`` form a valid ``YYYY-MM-DD`` range.

    The two arguments must be supplied together: either both are strings or
    neither is. When both are non-empty they are parsed with the
    ``%Y-%m-%d`` format, and ``date_to`` may not be earlier than
    ``date_from`` (equal dates are accepted).

    Raises:
        InvalidDate: If only one of the two is a string, if either value
            cannot be parsed as ``YYYY-MM-DD``, or if ``date_to`` precedes
            ``date_from``.
    """
    if isinstance(date_from, str) != isinstance(date_to, str):
        raise InvalidDate("Both date_from and date_to must be provided.")

    # Nothing to parse when either value is None or an empty string.
    if not (date_from and date_to):
        return

    # Only the parsing can raise ValueError, so only the parsing is guarded;
    # the original error is kept as the cause for easier debugging.
    try:
        start = datetime.strptime(date_from, "%Y-%m-%d")
        end = datetime.strptime(date_to, "%Y-%m-%d")
    except ValueError as err:
        raise InvalidDate("Invalid date format or range") from err

    if end < start:
        raise InvalidDate("date_to must be after date_from.")
|
|
|
|
|
|
def validate_limit(limit: int) -> None:
    """Ensure the requested property limit lies between 1 and 10,000.

    ``None`` is accepted and skips the check.

    Raises:
        ValueError: If ``limit`` is below 1 or above 10000.
    """
    if limit is None:
        return

    below_minimum = limit < 1
    if below_minimum or limit > 10000:
        raise ValueError("Property limit must be between 1 and 10,000.")
|
|
|
|
|
|
def validate_datetime(datetime_str: str | None) -> None:
    """Validate that ``datetime_str`` is an ISO 8601 date or datetime string.

    ``None`` and the empty string are treated as "not provided" and pass.
    Any other value must be parseable by ``datetime.fromisoformat``; a ``Z``
    UTC designator is accepted.

    Raises:
        InvalidDate: If the value cannot be parsed, including when it is not
            a string at all.
    """
    if not datetime_str:
        return

    try:
        # fromisoformat() only understands "Z" from Python 3.11 on, so it is
        # rewritten to the equivalent explicit UTC offset first.
        datetime.fromisoformat(datetime_str.replace("Z", "+00:00"))
    except (ValueError, AttributeError, TypeError) as err:
        # AttributeError / TypeError cover non-string input: objects with no
        # .replace(), or whose .replace() rejects str arguments (e.g. bytes).
        raise InvalidDate(
            f"Invalid datetime format: '{datetime_str}'. "
            f"Expected ISO 8601 format (e.g., '2025-01-20T14:30:00' or '2025-01-20')."
        ) from err
|
|
|
|
|
|
def validate_filters(
    beds_min: int | None = None,
    beds_max: int | None = None,
    baths_min: float | None = None,
    baths_max: float | None = None,
    sqft_min: int | None = None,
    sqft_max: int | None = None,
    price_min: int | None = None,
    price_max: int | None = None,
    lot_sqft_min: int | None = None,
    lot_sqft_max: int | None = None,
    year_built_min: int | None = None,
    year_built_max: int | None = None,
) -> None:
    """Check that no range filter has a minimum above its maximum.

    A pair is only checked when both of its bounds are given; equal bounds
    are allowed. Pairs are checked in signature order and the first
    offending pair is reported.

    Raises:
        ValueError: If a ``*_min`` value is greater than its ``*_max``.
    """
    bounds = {
        "beds": (beds_min, beds_max),
        "baths": (baths_min, baths_max),
        "sqft": (sqft_min, sqft_max),
        "price": (price_min, price_max),
        "lot_sqft": (lot_sqft_min, lot_sqft_max),
        "year_built": (year_built_min, year_built_max),
    }

    for name, (lower, upper) in bounds.items():
        # An open-ended range cannot be inverted.
        if lower is None or upper is None:
            continue
        if lower > upper:
            raise ValueError(f"{name}_min ({lower}) cannot be greater than {name}_max ({upper}).")
|
|
|
|
|
|
def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> None:
    """Check the sort field and sort direction against the accepted values.

    Falsy values (``None`` or an empty string) are treated as "not given"
    and pass. The sort field is checked before the direction.

    Raises:
        ValueError: If ``sort_by`` or ``sort_direction`` is given but is not
            one of the accepted options.
    """
    sortable_fields = ("list_date", "sold_date", "list_price", "sqft", "beds", "baths")
    directions = ("asc", "desc")

    field_ok = not sort_by or sort_by in sortable_fields
    if not field_ok:
        raise ValueError(
            f"Invalid sort_by value: '{sort_by}'. "
            f"Valid options: {', '.join(sortable_fields)}"
        )

    direction_ok = not sort_direction or sort_direction in directions
    if not direction_ok:
        raise ValueError(
            f"Invalid sort_direction value: '{sort_direction}'. "
            f"Valid options: {', '.join(directions)}"
        )
|