import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed

from homeharvest import scrape_property, Property
import pandas as pd


def test_realtor_pending_or_contingent():
    pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")

    regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale", exclude_pending=True)

    assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
    assert len(pending_or_contingent_result) != len(regular_result)


def test_realtor_pending_comps():
    pending_comps = scrape_property(
        location="2530 Al Lipscomb Way",
        radius=5,
        past_days=180,
        listing_type="pending",
    )

    for_sale_comps = scrape_property(
        location="2530 Al Lipscomb Way",
        radius=5,
        past_days=180,
        listing_type="for_sale",
    )

    sold_comps = scrape_property(
        location="2530 Al Lipscomb Way",
        radius=5,
        past_days=180,
        listing_type="sold",
    )

    results = [pending_comps, for_sale_comps, sold_comps]
    assert all([result is not None for result in results])

    #: assert all lengths are different
    assert len(set([len(result) for result in results])) == len(results)


def test_realtor_sold_past():
    result = scrape_property(
        location="San Diego, CA",
        past_days=30,
        listing_type="sold",
    )

    assert result is not None and len(result) > 0


def test_realtor_comps():
    result = scrape_property(
        location="2530 Al Lipscomb Way",
        radius=0.5,
        past_days=180,
        listing_type="sold",
    )

    assert result is not None and len(result) > 0


def test_realtor_last_x_days_sold():
    days_result_30 = scrape_property(location="Dallas, TX", listing_type="sold", past_days=30)

    days_result_10 = scrape_property(location="Dallas, TX", listing_type="sold", past_days=10)

    assert all([result is not None for result in [days_result_30, days_result_10]]) and len(days_result_30) != len(
        days_result_10
    )


def test_realtor_date_range_sold():
    days_result_30 = scrape_property(
        location="Dallas, TX", listing_type="sold", date_from="2023-05-01", date_to="2023-05-28"
    )

    days_result_60 = scrape_property(
        location="Dallas, TX", listing_type="sold", date_from="2023-04-01", date_to="2023-06-10"
    )

    assert all([result is not None for result in [days_result_30, days_result_60]]) and len(days_result_30) < len(
        days_result_60
    )


def test_listing_type_none_includes_sold():
    """Test that listing_type=None includes sold listings (issue #142)"""
    # Get properties with listing_type=None (should include all common types)
    result_none = scrape_property(
        location="Warren, MI",
        listing_type=None
    )

    # Verify we got results
    assert result_none is not None and len(result_none) > 0

    # Verify sold listings are included
    status_types = set(result_none['status'].unique())
    assert 'SOLD' in status_types, "SOLD listings should be included when listing_type=None"

    # Verify we get multiple listing types (not just one)
    assert len(status_types) > 1, "Should return multiple listing types when listing_type=None"


def test_realtor_single_property():
    results = [
        scrape_property(
            location="15509 N 172nd Dr, Surprise, AZ 85388",
            listing_type="for_sale",
        ),
        scrape_property(
            location="2530 Al Lipscomb Way",
            listing_type="for_sale",
        ),
    ]

    assert all([result is not None for result in results])


def test_realtor():
    results = [
        scrape_property(
            location="2530 Al Lipscomb Way",
            listing_type="for_sale",
        ),
        scrape_property(
            location="Phoenix, AZ", listing_type="for_rent", limit=1000
        ),  #: does not support "city, state, USA" format
        scrape_property(
            location="Dallas, TX", listing_type="sold", limit=1000
        ),  #: does not support "city, state, USA" format
        scrape_property(location="85281"),
    ]

    assert all([result is not None for result in results])


def test_realtor_city():
    results = scrape_property(location="Atlanta, GA", listing_type="for_sale", limit=1000)

    assert results is not None and len(results) > 0


def test_realtor_land():
    results = scrape_property(location="Atlanta, GA", listing_type="for_sale", property_type=["land"], limit=1000)

    assert results is not None and len(results) > 0


def test_realtor_bad_address():
    bad_results = scrape_property(
        location="abceefg ju098ot498hh9",
        listing_type="for_sale",
    )

    # A gibberish address should match nothing
    assert len(bad_results) == 0


def test_realtor_foreclosed():
    foreclosed = scrape_property(location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=True)

    not_foreclosed = scrape_property(location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=False)

    assert len(foreclosed) != len(not_foreclosed)


def test_realtor_agent():
    scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000, extra_property_data=False)
    assert scraped["agent_name"].nunique() > 1


def test_realtor_without_extra_details():
    results = [
        scrape_property(
            location="00741",
            listing_type="sold",
            limit=10,
            extra_property_data=False,
        ),
        scrape_property(
            location="00741",
            listing_type="sold",
            limit=10,
            extra_property_data=True,
        ),
    ]

    # When extra_property_data=False, these fields should be None
    extra_fields = ["nearby_schools", "assessed_value", "tax", "tax_history"]

    # Check that all extra fields are None when extra_property_data=False
    for field in extra_fields:
        if field in results[0].columns:
            assert results[0][field].isna().all(), f"Field '{field}' should be None when extra_property_data=False"


def test_pr_zip_code():
    results = scrape_property(
        location="00741",
        listing_type="for_sale",
    )

    assert results is not None and len(results) > 0


def test_exclude_pending():
    results = scrape_property(
        location="33567",
        listing_type="pending",
        exclude_pending=True,
    )

    assert results is not None and len(results) > 0


def test_style_value_error():
    results = scrape_property(
        location="Alaska, AK",
        listing_type="sold",
        extra_property_data=False,
        limit=1000,
    )

    assert results is not None and len(results) > 0


def test_primary_image_error():
    results = scrape_property(
        location="Spokane, PA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        past_days=360,
        radius=3,
        extra_property_data=False,
    )

    assert results is not None and len(results) > 0


def test_limit():
    over_limit = 876
    extra_params = {"limit": over_limit}

    over_results = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        **extra_params,
    )

    assert over_results is not None and len(over_results) <= over_limit

    under_limit = 1
    under_results = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        limit=under_limit,
    )

    assert under_results is not None and len(under_results) == under_limit


def test_apartment_list_price():
    results = scrape_property(
        location="Spokane, WA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        extra_property_data=False,
    )

    assert results is not None

    results = results[results["style"] == "APARTMENT"]

    #: at least half the apartments should have one of list_price, list_price_min, list_price_max populated
    assert (
        len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(results)
        > 0.5
    )


def test_phone_number_matching():
    searches = [
        scrape_property(
            location="Phoenix, AZ",
            listing_type="for_sale",
            limit=100,
        ),
        scrape_property(
            location="Phoenix, AZ",
            listing_type="for_sale",
            limit=100,
        ),
    ]

    assert all([search is not None for search in searches])

    #: random row
    row = searches[0][searches[0]["agent_phones"].notnull()].sample()

    #: find matching row
    matching_row = searches[1].loc[searches[1]["property_url"] == row["property_url"].values[0]]

    #: assert phone numbers are the same
    assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]


def test_parallel_search_consistency():
    """Test that the same search executed 3 times in parallel returns consistent results"""
    def search_task():
        return scrape_property(
            location="Phoenix, AZ",
            listing_type="for_sale",
            limit=100
        )

    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(search_task) for _ in range(3)]
        results = [future.result() for future in as_completed(futures)]

    # Verify all results are valid
    assert all([result is not None for result in results])
    assert all([isinstance(result, pd.DataFrame) for result in results])
    assert all([len(result) > 0 for result in results])

    # Verify all results have the same length (primary consistency check)
    lengths = [len(result) for result in results]
    assert len(set(lengths)) == 1, \
        f"All parallel searches should return same number of results, got lengths: {lengths}"


def test_return_type():
    results = {
        "pandas": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)],
        "pydantic": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic")],
        "raw": [
            scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw"),
            scrape_property(location="85281", listing_type="for_rent", limit=100, return_type="raw"),
        ],
    }

    assert all(isinstance(result, pd.DataFrame) for result in results["pandas"])
    assert all(isinstance(result[0], Property) for result in results["pydantic"])
    assert all(isinstance(result[0], dict) for result in results["raw"])


def test_has_open_house():
    """Test that open_houses field is present and properly structured when it exists"""

    # Test that open_houses field exists in results (may be None if no open houses scheduled)
    address_result = scrape_property("1 Hawthorne St Unit 12F, San Francisco, CA 94105", return_type="raw")
    assert "open_houses" in address_result[0], "open_houses field should exist in address search results"

    # Test general search also includes open_houses field
    zip_code_result = scrape_property("94105", listing_type="for_sale", limit=50, return_type="raw")
    assert len(zip_code_result) > 0, "Should have results from zip code search"

    # Verify open_houses field exists in general search
    assert "open_houses" in zip_code_result[0], "open_houses field should exist in general search results"

    # If we find any properties with open houses, verify the data structure
    properties_with_open_houses = [prop for prop in zip_code_result if prop.get("open_houses") is not None]

    if properties_with_open_houses:
        # Verify structure of open_houses data
        first_with_open_house = properties_with_open_houses[0]
        assert isinstance(first_with_open_house["open_houses"], (list, dict)), \
            "open_houses should be a list or dict when present"


def test_return_type_consistency():
    """Test that return_type works consistently between general and address searches"""

    # Test configurations - different search types
    test_locations = [
        ("Dallas, TX", "general"),  # General city search
        ("75201", "zip"),  # ZIP code search
        ("2530 Al Lipscomb Way", "address")  # Address search
    ]

    for location, search_type in test_locations:
        # Test all return types for each search type
        pandas_result = scrape_property(
            location=location,
            listing_type="for_sale",
            limit=3,
            return_type="pandas"
        )

        pydantic_result = scrape_property(
            location=location,
            listing_type="for_sale",
            limit=3,
            return_type="pydantic"
        )

        raw_result = scrape_property(
            location=location,
            listing_type="for_sale",
            limit=3,
            return_type="raw"
        )

        # Validate pandas return type
        assert isinstance(pandas_result, pd.DataFrame), f"pandas result should be DataFrame for {search_type}"
        assert len(pandas_result) > 0, f"pandas result should not be empty for {search_type}"

        required_columns = ["property_id", "property_url", "list_price", "status", "formatted_address"]
        for col in required_columns:
            assert col in pandas_result.columns, f"Missing column {col} in pandas result for {search_type}"

        # Validate pydantic return type
        assert isinstance(pydantic_result, list), f"pydantic result should be list for {search_type}"
        assert len(pydantic_result) > 0, f"pydantic result should not be empty for {search_type}"

        for item in pydantic_result:
            assert isinstance(item, Property), f"pydantic items should be Property objects for {search_type}"
            assert item.property_id is not None, f"property_id should not be None for {search_type}"

        # Validate raw return type
        assert isinstance(raw_result, list), f"raw result should be list for {search_type}"
        assert len(raw_result) > 0, f"raw result should not be empty for {search_type}"

        for item in raw_result:
            assert isinstance(item, dict), f"raw items should be dict for {search_type}"
            assert "property_id" in item, f"raw items should have property_id for {search_type}"
            assert "href" in item, f"raw items should have href for {search_type}"

        # Cross-validate that different return types return related data
        pandas_ids = set(pandas_result["property_id"].tolist())
        pydantic_ids = set(prop.property_id for prop in pydantic_result)
        raw_ids = set(item["property_id"] for item in raw_result)

        # All return types should have some properties
        assert len(pandas_ids) > 0, f"pandas should return properties for {search_type}"
        assert len(pydantic_ids) > 0, f"pydantic should return properties for {search_type}"
        assert len(raw_ids) > 0, f"raw should return properties for {search_type}"


def test_pending_date_filtering():
    """Test that pending properties are properly filtered by pending_date using client-side filtering."""

    # Test 1: Verify that date filtering works with different time windows
    result_no_filter = scrape_property(
        location="Dallas, TX",
        listing_type="pending",
        limit=20
    )

    result_30_days = scrape_property(
        location="Dallas, TX",
        listing_type="pending",
        past_days=30,
        limit=20
    )

    result_10_days = scrape_property(
        location="Dallas, TX",
        listing_type="pending",
        past_days=10,
        limit=20
    )

    # Basic assertions - we should get some results
    assert result_no_filter is not None and len(result_no_filter) >= 0
    assert result_30_days is not None and len(result_30_days) >= 0
    assert result_10_days is not None and len(result_10_days) >= 0

    # Filtering should work: longer periods should return same or more results
    assert len(result_30_days) <= len(result_no_filter), "30-day filter should return <= unfiltered results"
    assert len(result_10_days) <= len(result_30_days), "10-day filter should return <= 30-day results"

    # Test 2: Verify that date range filtering works
    if len(result_no_filter) > 0:
        result_date_range = scrape_property(
            location="Dallas, TX",
            listing_type="pending",
            date_from="2025-08-01",
            date_to="2025-12-31",
            limit=20
        )

        assert result_date_range is not None
        # Date range should capture recent properties
        assert len(result_date_range) >= 0

    # Test 3: Verify that both pending and contingent properties are included
    # Get raw data to check property types
    if len(result_no_filter) > 0:
        raw_result = scrape_property(
            location="Dallas, TX",
            listing_type="pending",
            return_type="raw",
            limit=15
        )

        if raw_result:
            # Check that we get both pending and contingent properties
            pending_count = 0
            contingent_count = 0

            for prop in raw_result:
                flags = prop.get('flags', {})
                if flags.get('is_pending'):
                    pending_count += 1
                if flags.get('is_contingent'):
                    contingent_count += 1

            # We should get at least one of each type (when available)
            total_properties = pending_count + contingent_count
            assert total_properties > 0, "Should find at least some pending or contingent properties"


def test_hour_based_filtering():
    """Test the new past_hours parameter for hour-level filtering"""
    from datetime import datetime, timedelta

    # Test for sold properties with 24-hour filter
    result_24h = scrape_property(
        location="Phoenix, AZ",
        listing_type="sold",
        past_hours=24,
        limit=50
    )

    # Test for sold properties with 12-hour filter
    result_12h = scrape_property(
        location="Phoenix, AZ",
        listing_type="sold",
        past_hours=12,
        limit=50
    )

    assert result_24h is not None
    assert result_12h is not None

    # 12-hour filter should return same or fewer results than 24-hour
    if len(result_12h) > 0 and len(result_24h) > 0:
        assert len(result_12h) <= len(result_24h), "12-hour results should be <= 24-hour results"

    # Verify timestamps are within the specified hour range for 24h filter
    if len(result_24h) > 0:
        cutoff_time = datetime.now() - timedelta(hours=24)

        # Check a few results
        for idx in range(min(5, len(result_24h))):
            sold_date_str = result_24h.iloc[idx]["last_sold_date"]
            if pd.notna(sold_date_str):
                try:
                    sold_date = datetime.strptime(str(sold_date_str), "%Y-%m-%d %H:%M:%S")
                    # Date should be within last 24 hours
                    assert sold_date >= cutoff_time, f"Property sold date {sold_date} should be within last 24 hours"
                except (ValueError, TypeError):
                    pass  # Skip if date parsing fails


def test_past_hours_all_listing_types():
    """Validate that past_hours works correctly for all listing types with proper date fields"""
    from datetime import datetime, timedelta

    # Test 1: SOLD (uses last_sold_date field, server-side filters by sold_date)
    result_sold = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        past_hours=48,
        limit=20
    )

    assert result_sold is not None
    if len(result_sold) > 0:
        cutoff_48h = datetime.now() - timedelta(hours=48)

        # Verify results use sold_date and are within 48 hours
        for idx in range(min(5, len(result_sold))):
            sold_date_str = result_sold.iloc[idx]["last_sold_date"]
            if pd.notna(sold_date_str):
                try:
                    sold_date = datetime.strptime(str(sold_date_str), "%Y-%m-%d %H:%M:%S")
                    assert sold_date >= cutoff_48h, \
                        f"SOLD: last_sold_date {sold_date} should be within 48 hours"
                except (ValueError, TypeError):
                    pass

    # Test 2: FOR_SALE (uses list_date field, server-side filters by list_date)
    result_for_sale = scrape_property(
        location="Austin, TX",
        listing_type="for_sale",
        past_hours=48,
        limit=20
    )

    assert result_for_sale is not None
    if len(result_for_sale) > 0:
        cutoff_48h = datetime.now() - timedelta(hours=48)

        # Verify results use list_date and are within 48 hours
        for idx in range(min(5, len(result_for_sale))):
            list_date_str = result_for_sale.iloc[idx]["list_date"]
            if pd.notna(list_date_str):
                try:
                    list_date = datetime.strptime(str(list_date_str), "%Y-%m-%d %H:%M:%S")
                    assert list_date >= cutoff_48h, \
                        f"FOR_SALE: list_date {list_date} should be within 48 hours"
                except (ValueError, TypeError):
                    pass

    # Test 3: FOR_RENT (uses list_date field, server-side filters by list_date)
    result_for_rent = scrape_property(
        location="Houston, TX",
        listing_type="for_rent",
        past_hours=72,
        limit=20
    )

    assert result_for_rent is not None
    if len(result_for_rent) > 0:
        cutoff_72h = datetime.now() - timedelta(hours=72)

        # Verify results use list_date and are within 72 hours
        for idx in range(min(5, len(result_for_rent))):
            list_date_str = result_for_rent.iloc[idx]["list_date"]
            if pd.notna(list_date_str):
                try:
                    list_date = datetime.strptime(str(list_date_str), "%Y-%m-%d %H:%M:%S")
                    assert list_date >= cutoff_72h, \
                        f"FOR_RENT: list_date {list_date} should be within 72 hours"
                except (ValueError, TypeError):
                    pass

    # Test 4: PENDING (uses pending_date field, client-side filtering only)
    result_pending = scrape_property(
        location="San Antonio, TX",
        listing_type="pending",
        past_hours=48,
        limit=20
    )

    assert result_pending is not None
    # Note: PENDING doesn't use server-side date filtering (API filters broken)
    # Client-side filtering should still work via pending_date
    if len(result_pending) > 0:
        cutoff_48h = datetime.now() - timedelta(hours=48)

        # Verify results use pending_date (or are contingent without date)
        for idx in range(min(5, len(result_pending))):
            pending_date_str = result_pending.iloc[idx]["pending_date"]
            if pd.notna(pending_date_str):
                try:
                    pending_date = datetime.strptime(str(pending_date_str), "%Y-%m-%d %H:%M:%S")
                    assert pending_date >= cutoff_48h, \
                        f"PENDING: pending_date {pending_date} should be within 48 hours"
                except (ValueError, TypeError):
                    pass
            # else: property is contingent without pending_date, which is allowed


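# The four blocks above (and several tests below) repeat the same
# parse-and-compare pattern. A shared helper could centralize it; this is a
# sketch only, not part of the homeharvest API, and it assumes the
# "%Y-%m-%d %H:%M:%S" string format the assertions above already rely on:
def _dates_within_hours(df, column, hours, sample=5):
    """Check that the first `sample` parseable dates in `column` fall within `hours` of now."""
    from datetime import datetime, timedelta

    cutoff = datetime.now() - timedelta(hours=hours)
    for idx in range(min(sample, len(df))):
        value = df.iloc[idx][column]
        if pd.notna(value):
            try:
                parsed = datetime.strptime(str(value), "%Y-%m-%d %H:%M:%S")
            except (ValueError, TypeError):
                continue  # skip unparseable values, as the tests above do
            if parsed < cutoff:
                return False
    return True

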
def test_datetime_filtering():
    """Test date_from and date_to parameters with hour precision"""
    from datetime import datetime, timedelta

    # Get a recent date range (e.g., yesterday)
    yesterday = datetime.now() - timedelta(days=1)
    date_str = yesterday.strftime("%Y-%m-%d")

    # Test filtering for business hours (9 AM to 5 PM) on a specific day
    result = scrape_property(
        location="Dallas, TX",
        listing_type="for_sale",
        date_from=f"{date_str}T09:00:00",
        date_to=f"{date_str}T17:00:00",
        limit=30
    )

    assert result is not None

    # Test with only date_from
    result_from_only = scrape_property(
        location="Houston, TX",
        listing_type="for_sale",
        date_from=f"{date_str}T00:00:00",
        limit=30
    )

    assert result_from_only is not None

    # Test with only date_to
    result_to_only = scrape_property(
        location="Austin, TX",
        listing_type="for_sale",
        date_to=f"{date_str}T23:59:59",
        limit=30
    )

    assert result_to_only is not None


def test_full_datetime_preservation():
    """Verify that dates now include full timestamps (YYYY-MM-DD HH:MM:SS)"""

    # Test with pandas return type
    result_pandas = scrape_property(
        location="San Diego, CA",
        listing_type="sold",
        past_days=30,
        limit=10
    )

    assert result_pandas is not None and len(result_pandas) > 0

    # Check that date fields contain time information
    if len(result_pandas) > 0:
        for idx in range(min(3, len(result_pandas))):
            # Check last_sold_date
            sold_date = result_pandas.iloc[idx]["last_sold_date"]
            if pd.notna(sold_date):
                sold_date_str = str(sold_date)
                # Should contain time (HH:MM:SS), not just date
                assert " " in sold_date_str or "T" in sold_date_str, \
                    f"Date should include time component: {sold_date_str}"

    # Test with pydantic return type
    result_pydantic = scrape_property(
        location="Los Angeles, CA",
        listing_type="for_sale",
        past_days=7,
        limit=10,
        return_type="pydantic"
    )

    assert result_pydantic is not None and len(result_pydantic) > 0

    # Verify Property objects have datetime objects with time info
    for prop in result_pydantic[:3]:
        if prop.list_date:
            # Should be a datetime object, not just a date
            assert hasattr(prop.list_date, 'hour'), "list_date should be a datetime with time"


def test_beds_filtering():
    """Test bedroom filtering with beds_min and beds_max"""

    result = scrape_property(
        location="Atlanta, GA",
        listing_type="for_sale",
        beds_min=2,
        beds_max=4,
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify all properties have 2-4 bedrooms
    for idx in range(min(10, len(result))):
        beds = result.iloc[idx]["beds"]
        if pd.notna(beds):
            assert 2 <= beds <= 4, f"Property should have 2-4 beds, got {beds}"

    # Test beds_min only
    result_min = scrape_property(
        location="Denver, CO",
        listing_type="for_sale",
        beds_min=3,
        limit=30
    )

    assert result_min is not None

    # Test beds_max only
    result_max = scrape_property(
        location="Seattle, WA",
        listing_type="for_sale",
        beds_max=2,
        limit=30
    )

    assert result_max is not None


def test_baths_filtering():
    """Test bathroom filtering with baths_min and baths_max"""

    result = scrape_property(
        location="Miami, FL",
        listing_type="for_sale",
        baths_min=2.0,
        baths_max=3.5,
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify bathrooms are within range
    for idx in range(min(10, len(result))):
        full_baths = result.iloc[idx]["full_baths"]
        half_baths = result.iloc[idx]["half_baths"]

        if pd.notna(full_baths):
            total_baths = float(full_baths) + (float(half_baths) * 0.5 if pd.notna(half_baths) else 0)
            # Allow some tolerance below the 2.0 minimum as the API might count baths differently
            if total_baths > 0:
                assert total_baths >= 1.5, f"Baths should be >= 1.5 (2.0 minimum minus tolerance), got {total_baths}"


def test_sqft_filtering():
    """Test square footage filtering"""

    result = scrape_property(
        location="Portland, OR",
        listing_type="for_sale",
        sqft_min=1000,
        sqft_max=2500,
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify sqft is within range
    for idx in range(min(10, len(result))):
        sqft = result.iloc[idx]["sqft"]
        if pd.notna(sqft) and sqft > 0:
            assert 1000 <= sqft <= 2500, f"Sqft should be 1000-2500, got {sqft}"


def test_price_filtering():
    """Test price range filtering"""

    result = scrape_property(
        location="Charlotte, NC",
        listing_type="for_sale",
        price_min=200000,
        price_max=500000,
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify prices are within range
    for idx in range(min(15, len(result))):
        price = result.iloc[idx]["list_price"]
        if pd.notna(price) and price > 0:
            assert 200000 <= price <= 500000, f"Price should be $200k-$500k, got ${price}"


def test_lot_sqft_filtering():
    """Test lot size filtering"""

    result = scrape_property(
        location="Scottsdale, AZ",
        listing_type="for_sale",
        lot_sqft_min=5000,
        lot_sqft_max=15000,
        limit=30
    )

    assert result is not None
    # Results might be fewer if lot_sqft data is sparse


def test_year_built_filtering():
    """Test year built filtering"""

    result = scrape_property(
        location="Tampa, FL",
        listing_type="for_sale",
        year_built_min=2000,
        year_built_max=2024,
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify year_built is within range
    for idx in range(min(10, len(result))):
        year = result.iloc[idx]["year_built"]
        if pd.notna(year) and year > 0:
            assert 2000 <= year <= 2024, f"Year should be 2000-2024, got {year}"


def test_combined_filters():
    """Test multiple filters working together"""

    result = scrape_property(
        location="Nashville, TN",
        listing_type="for_sale",
        beds_min=3,
        baths_min=2.0,
        sqft_min=1500,
        price_min=250000,
        price_max=600000,
        year_built_min=1990,
        limit=30
    )

    assert result is not None

    # If we get results, verify they meet ALL criteria
    if len(result) > 0:
        for idx in range(min(5, len(result))):
            row = result.iloc[idx]

            # Check beds
            if pd.notna(row["beds"]):
                assert row["beds"] >= 3, f"Beds should be >= 3, got {row['beds']}"

            # Check sqft
            if pd.notna(row["sqft"]) and row["sqft"] > 0:
                assert row["sqft"] >= 1500, f"Sqft should be >= 1500, got {row['sqft']}"

            # Check price
            if pd.notna(row["list_price"]) and row["list_price"] > 0:
                assert 250000 <= row["list_price"] <= 600000, \
                    f"Price should be $250k-$600k, got ${row['list_price']}"

            # Check year
            if pd.notna(row["year_built"]) and row["year_built"] > 0:
                assert row["year_built"] >= 1990, \
                    f"Year should be >= 1990, got {row['year_built']}"


def test_sorting_by_price():
    """Test sorting by list_price with actual sort order validation"""

    # Sort ascending (cheapest first) with multi-page limit to test concatenation
    result_asc = scrape_property(
        location="Orlando, FL",
        listing_type="for_sale",
        sort_by="list_price",
        sort_direction="asc",
        limit=250  # Multi-page to test concatenation logic
    )

    assert result_asc is not None and len(result_asc) > 0

    # Verify ascending sort order (allow for None/NA values at the end)
    prices_asc = result_asc["list_price"].dropna().tolist()
    assert len(prices_asc) > 0, "No properties with prices found"
    assert prices_asc == sorted(prices_asc), f"Prices not in ascending order: {prices_asc[:10]}"

    # Sort descending (most expensive first)
    result_desc = scrape_property(
        location="San Antonio, TX",
        listing_type="for_sale",
        sort_by="list_price",
        sort_direction="desc",
        limit=250  # Multi-page to test concatenation logic
    )

    assert result_desc is not None and len(result_desc) > 0

    # Verify descending sort order (allow for None/NA values at the end)
    prices_desc = result_desc["list_price"].dropna().tolist()
    assert len(prices_desc) > 0, "No properties with prices found"
    assert prices_desc == sorted(prices_desc, reverse=True), f"Prices not in descending order: {prices_desc[:10]}"


def test_sorting_by_date():
    """Test sorting by list_date with actual sort order validation"""

    # Test descending (newest first) with multi-page limit
    result_desc = scrape_property(
        location="Columbus, OH",
        listing_type="for_sale",
        sort_by="list_date",
        sort_direction="desc",  # Newest first
        limit=250  # Multi-page to test concatenation logic
    )

    assert result_desc is not None and len(result_desc) > 0

    # Verify descending sort order (allow for None/NA values at the end)
    dates_desc = result_desc["list_date"].dropna().tolist()
    assert len(dates_desc) > 0, "No properties with dates found"
    assert dates_desc == sorted(dates_desc, reverse=True), f"Dates not in descending order (newest first): {dates_desc[:10]}"

    # Test ascending (oldest first)
    result_asc = scrape_property(
        location="Columbus, OH",
        listing_type="for_sale",
        sort_by="list_date",
        sort_direction="asc",  # Oldest first
        limit=250
    )

    assert result_asc is not None and len(result_asc) > 0

    # Verify ascending sort order
    dates_asc = result_asc["list_date"].dropna().tolist()
    assert len(dates_asc) > 0, "No properties with dates found"
    assert dates_asc == sorted(dates_asc), f"Dates not in ascending order (oldest first): {dates_asc[:10]}"


def test_sorting_by_sqft():
    """Test sorting by square footage with actual sort order validation"""

    # Test descending (largest first) with multi-page limit
    result_desc = scrape_property(
        location="Indianapolis, IN",
        listing_type="for_sale",
        sort_by="sqft",
        sort_direction="desc",  # Largest first
        limit=250  # Multi-page to test concatenation logic
    )

    assert result_desc is not None and len(result_desc) > 0

    # Verify descending sort order (allow for None/NA values at the end)
    sqfts_desc = result_desc["sqft"].dropna().tolist()
    assert len(sqfts_desc) > 0, "No properties with sqft found"
    assert sqfts_desc == sorted(sqfts_desc, reverse=True), f"Square footages not in descending order: {sqfts_desc[:10]}"

    # Test ascending (smallest first)
    result_asc = scrape_property(
        location="Indianapolis, IN",
        listing_type="for_sale",
        sort_by="sqft",
        sort_direction="asc",  # Smallest first
        limit=250
    )

    assert result_asc is not None and len(result_asc) > 0

    # Verify ascending sort order
    sqfts_asc = result_asc["sqft"].dropna().tolist()
    assert len(sqfts_asc) > 0, "No properties with sqft found"
    assert sqfts_asc == sorted(sqfts_asc), f"Square footages not in ascending order: {sqfts_asc[:10]}"


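# The three sorting tests above repeat a dropna-then-compare check. An
# equivalent helper, sketched here with plain pandas and no homeharvest
# assumptions, could replace the duplicated assertions:
def _assert_sorted(df, column, ascending=True):
    """Assert that the non-null values of `column` are sorted in the given direction."""
    values = df[column].dropna().tolist()
    assert len(values) > 0, f"No properties with {column} found"
    direction = "ascending" if ascending else "descending"
    assert values == sorted(values, reverse=not ascending), \
        f"{column} not in {direction} order: {values[:10]}"

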
def test_filter_validation_errors():
    """Test that validation catches invalid parameters"""
    import pytest

    # Test: beds_min > beds_max should raise ValueError
    with pytest.raises(ValueError, match="beds_min.*cannot be greater than.*beds_max"):
        scrape_property(
            location="Boston, MA",
            listing_type="for_sale",
            beds_min=5,
            beds_max=2,
            limit=10
        )

    # Test: invalid datetime format should raise exception
    with pytest.raises(Exception):  # InvalidDate
        scrape_property(
            location="Boston, MA",
            listing_type="for_sale",
            date_from="not-a-valid-datetime",
            limit=10
        )

    # Test: invalid sort_by value should raise ValueError
    with pytest.raises(ValueError, match="Invalid sort_by"):
        scrape_property(
            location="Boston, MA",
            listing_type="for_sale",
            sort_by="invalid_field",
            limit=10
        )

    # Test: invalid sort_direction should raise ValueError
    with pytest.raises(ValueError, match="Invalid sort_direction"):
        scrape_property(
            location="Boston, MA",
            listing_type="for_sale",
            sort_by="list_price",
            sort_direction="invalid",
            limit=10
        )


def test_backward_compatibility():
    """Ensure old parameters still work as expected"""

    # Test past_days still works
    result_past_days = scrape_property(
        location="Las Vegas, NV",
        listing_type="sold",
        past_days=30,
        limit=20
    )

    assert result_past_days is not None and len(result_past_days) > 0

    # Test date_from/date_to still work
    result_date_range = scrape_property(
        location="Memphis, TN",
        listing_type="sold",
        date_from="2024-01-01",
        date_to="2024-03-31",
        limit=20
    )

    assert result_date_range is not None

    # Test property_type still works
    result_property_type = scrape_property(
        location="Louisville, KY",
        listing_type="for_sale",
        property_type=["single_family"],
        limit=20
    )

    assert result_property_type is not None and len(result_property_type) > 0

    # Test foreclosure still works
    result_foreclosure = scrape_property(
        location="Detroit, MI",
        listing_type="for_sale",
        foreclosure=True,
        limit=15
    )

    assert result_foreclosure is not None


def test_last_status_change_date_field():
    """Test that last_status_change_date field is present and has hour-level precision"""
    from datetime import datetime

    # Test 1: Field is present in SOLD listings
    result_sold = scrape_property(
        location="Phoenix, AZ",
        listing_type="sold",
        past_days=30,
        limit=20
    )

    assert result_sold is not None and len(result_sold) > 0

    # Check that last_status_change_date column exists
    assert "last_status_change_date" in result_sold.columns, \
        "last_status_change_date column should be present in results"

    # Check that at least some properties have this field populated
    has_status_change_date = False
    for idx in range(min(10, len(result_sold))):
        status_change_date_str = result_sold.iloc[idx]["last_status_change_date"]
        if pd.notna(status_change_date_str):
            has_status_change_date = True
            # Verify it has hour-level precision (includes time)
            assert " " in str(status_change_date_str) or "T" in str(status_change_date_str), \
                f"last_status_change_date should include time component: {status_change_date_str}"
            break

    # Note: It's possible some properties don't have this field, so we just verify it exists
    # assert has_status_change_date, "At least some properties should have last_status_change_date"

    # Test 2: Field is present in PENDING listings
    result_pending = scrape_property(
        location="Dallas, TX",
        listing_type="pending",
        past_days=30,
        limit=20
    )

    assert result_pending is not None
    # Only check columns if we have results (empty DataFrame has no columns)
    if len(result_pending) > 0:
        assert "last_status_change_date" in result_pending.columns, \
            "last_status_change_date column should be present in PENDING results"

    # Test 3: Field is present in FOR_SALE listings
    result_for_sale = scrape_property(
        location="Austin, TX",
        listing_type="for_sale",
        past_days=7,
        limit=20
    )

    assert result_for_sale is not None and len(result_for_sale) > 0
    assert "last_status_change_date" in result_for_sale.columns, \
        "last_status_change_date column should be present in FOR_SALE results"


def test_last_status_change_date_precision_enhancement():
    """Test that pending_date and last_sold_date use hour-precision from last_status_change_date"""
    from datetime import datetime

    # Test with pydantic return type to examine actual Property objects
    # Use a larger time window to ensure we get some results
    result_sold = scrape_property(
        location="Phoenix, AZ",
        listing_type="sold",
        past_days=90,
        limit=30,
        return_type="pydantic"
    )

    assert result_sold is not None

    # Only run assertions if we have data (data availability may vary)
    if len(result_sold) > 0:
        # Check that dates have hour-level precision (not just date)
        for prop in result_sold[:10]:
            # If both last_sold_date and last_status_change_date exist
            if prop.last_sold_date and prop.last_status_change_date:
                # Both should be datetime objects with time info
                assert hasattr(prop.last_sold_date, 'hour'), \
                    "last_sold_date should have hour precision"
                assert hasattr(prop.last_status_change_date, 'hour'), \
                    "last_status_change_date should have hour precision"

                # If they're on the same day, the processor should have used
                # last_status_change_date to provide hour precision for last_sold_date
                if prop.last_sold_date.date() == prop.last_status_change_date.date():
                    # They should have the same timestamp (hour/minute/second)
                    assert prop.last_sold_date == prop.last_status_change_date, \
                        "last_sold_date should match last_status_change_date for hour precision"

    # Test with PENDING listings
    result_pending = scrape_property(
        location="Dallas, TX",
        listing_type="pending",
        past_days=90,
        limit=30,
        return_type="pydantic"
    )

    assert result_pending is not None

    # Only run assertions if we have data
    if len(result_pending) > 0:
        for prop in result_pending[:10]:
            # If both pending_date and last_status_change_date exist
            if prop.pending_date and prop.last_status_change_date:
                assert hasattr(prop.pending_date, 'hour'), \
                    "pending_date should have hour precision"
                assert hasattr(prop.last_status_change_date, 'hour'), \
                    "last_status_change_date should have hour precision"

                # If they're on the same day, pending_date should use the time from last_status_change_date
                if prop.pending_date.date() == prop.last_status_change_date.date():
                    assert prop.pending_date == prop.last_status_change_date, \
                        "pending_date should match last_status_change_date for hour precision"


def test_last_status_change_date_filtering_fallback():
    """Test that filtering falls back to last_status_change_date when primary date is missing"""
    from datetime import datetime, timedelta

    # This test verifies that if a property doesn't have the primary date field
    # (e.g., pending_date for PENDING listings), it can still be filtered using
    # last_status_change_date as a fallback

    # Test with PENDING properties using past_hours (client-side filtering)
    result_pending = scrape_property(
        location="Miami, FL",
        listing_type="pending",
        past_hours=72,
        limit=30
    )

    assert result_pending is not None

    # If we get results, verify they have either pending_date or last_status_change_date
    if len(result_pending) > 0:
        cutoff_time = datetime.now() - timedelta(hours=72)

        for idx in range(min(5, len(result_pending))):
            pending_date_str = result_pending.iloc[idx]["pending_date"]
            status_change_date_str = result_pending.iloc[idx]["last_status_change_date"]

            # At least one of these should be present for filtering to work
            has_date = pd.notna(pending_date_str) or pd.notna(status_change_date_str)

            # Note: Contingent properties without dates are allowed, so we don't assert here
            # The test just verifies the field exists and can be used


def test_last_status_change_date_hour_filtering():
    """Test that past_hours filtering works correctly with last_status_change_date for PENDING/SOLD"""
    from datetime import datetime, timedelta

    # Test with SOLD properties
    result_sold = scrape_property(
        location="Atlanta, GA",
        listing_type="sold",
        past_hours=48,
        limit=30
    )

    assert result_sold is not None

    if len(result_sold) > 0:
        cutoff_time = datetime.now() - timedelta(hours=48)

        # Verify that results are within 48 hours
        for idx in range(min(5, len(result_sold))):
            sold_date_str = result_sold.iloc[idx]["last_sold_date"]
            if pd.notna(sold_date_str):
                try:
                    sold_date = datetime.strptime(str(sold_date_str), "%Y-%m-%d %H:%M:%S")
                    # Should be within 48 hours with hour-level precision
                    assert sold_date >= cutoff_time, \
                        f"SOLD property last_sold_date {sold_date} should be within 48 hours of {cutoff_time}"
                except (ValueError, TypeError):
                    pass  # Skip if parsing fails

    # Test with PENDING properties
    result_pending = scrape_property(
        location="Denver, CO",
        listing_type="pending",
        past_hours=48,
        limit=30
    )

    assert result_pending is not None

    if len(result_pending) > 0:
        cutoff_time = datetime.now() - timedelta(hours=48)

        # Verify that results are within 48 hours
        for idx in range(min(5, len(result_pending))):
            pending_date_str = result_pending.iloc[idx]["pending_date"]
            if pd.notna(pending_date_str):
                try:
                    pending_date = datetime.strptime(str(pending_date_str), "%Y-%m-%d %H:%M:%S")
                    # Should be within 48 hours with hour-level precision
                    assert pending_date >= cutoff_time, \
                        f"PENDING property pending_date {pending_date} should be within 48 hours of {cutoff_time}"
                except (ValueError, TypeError):
                    pass  # Skip if parsing fails


def test_exclude_pending_with_raw_data():
    """Test that exclude_pending parameter works correctly with return_type='raw'"""

    # Query for sale properties with exclude_pending=True and raw data
    result = scrape_property(
        location="Phoenix, AZ",
        listing_type="for_sale",
        exclude_pending=True,
        return_type="raw",
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify that no pending or contingent properties are in the results
    for prop in result:
        flags = prop.get('flags', {})
        is_pending = flags.get('is_pending', False)
        is_contingent = flags.get('is_contingent', False)

        assert not is_pending, f"Property {prop.get('property_id')} should not be pending when exclude_pending=True"
        assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent when exclude_pending=True"


def test_mls_only_with_raw_data():
    """Test that mls_only parameter works correctly with return_type='raw'"""

    # Query with mls_only=True and raw data
    result = scrape_property(
        location="Dallas, TX",
        listing_type="for_sale",
        mls_only=True,
        return_type="raw",
        limit=50
    )

    assert result is not None and len(result) > 0

    # Verify that all properties have MLS IDs (stored in source.id)
    for prop in result:
        source = prop.get('source', {})
        mls_id = source.get('id') if source else None

        assert mls_id is not None and mls_id != "", \
            f"Property {prop.get('property_id')} should have an MLS ID (source.id) when mls_only=True, got: {mls_id}"


def test_combined_filters_with_raw_data():
    """Test that both exclude_pending and mls_only work together with return_type='raw'"""

    # Query with both filters enabled and raw data
    result = scrape_property(
        location="Austin, TX",
        listing_type="for_sale",
        exclude_pending=True,
        mls_only=True,
        return_type="raw",
        limit=30
    )

    assert result is not None and len(result) > 0

    # Verify both filters are applied
    for prop in result:
        # Check exclude_pending filter
        flags = prop.get('flags', {})
        is_pending = flags.get('is_pending', False)
        is_contingent = flags.get('is_contingent', False)

        assert not is_pending, f"Property {prop.get('property_id')} should not be pending"
        assert not is_contingent, f"Property {prop.get('property_id')} should not be contingent"

        # Check mls_only filter
        source = prop.get('source', {})
        mls_id = source.get('id') if source else None

        assert mls_id is not None and mls_id != "", \
            f"Property {prop.get('property_id')} should have an MLS ID (source.id)"


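# The three raw-data tests above all dig into prop["flags"] and prop["source"].
# A predicate sketch, under the same assumptions those tests make about the
# raw dict shape (nothing here is a documented homeharvest structure):
def _is_active_mls_listing(prop):
    """True if a raw property dict is neither pending nor contingent and has an MLS ID."""
    flags = prop.get("flags") or {}
    source = prop.get("source") or {}
    return not flags.get("is_pending", False) \
        and not flags.get("is_contingent", False) \
        and bool(source.get("id"))

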
def test_updated_since_filtering():
    """Test the updated_since parameter for filtering by last_update_date"""
    from datetime import datetime, timedelta

    # Test 1: Filter by last update in past 10 minutes (user's example)
    cutoff_time = datetime.now() - timedelta(minutes=10)
    result_10min = scrape_property(
        location="California",
        updated_since=cutoff_time,
        sort_by="last_update_date",
        sort_direction="desc",
        limit=100
    )

    assert result_10min is not None
    print(f"\n10-minute window returned {len(result_10min)} properties")

    # Test 2: Verify all results have last_update_date within range
    if len(result_10min) > 0:
        for idx in range(min(10, len(result_10min))):
            update_date_str = result_10min.iloc[idx]["last_update_date"]
            if pd.notna(update_date_str):
                try:
                    # Handle timezone-aware datetime strings
                    date_str = str(update_date_str)
                    if '+' in date_str or date_str.endswith('Z'):
                        # Remove timezone for comparison with naive cutoff_time
                        date_str = date_str.replace('+00:00', '').replace('Z', '')
                    update_date = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")

                    assert update_date >= cutoff_time, \
                        f"Property last_update_date {update_date} should be >= {cutoff_time}"
                    print(f"Property {idx}: last_update_date = {update_date} (valid)")
                except (ValueError, TypeError) as e:
                    print(f"Warning: Could not parse date {update_date_str}: {e}")

    # Test 3: Compare different time windows
    result_1hour = scrape_property(
        location="California",
        updated_since=datetime.now() - timedelta(hours=1),
        limit=50
    )

    result_24hours = scrape_property(
        location="California",
        updated_since=datetime.now() - timedelta(hours=24),
        limit=50
    )

    print(f"1-hour window: {len(result_1hour)} properties")
    print(f"24-hour window: {len(result_24hours)} properties")

    # Longer time window should return same or more results
    if len(result_1hour) > 0 and len(result_24hours) > 0:
        assert len(result_1hour) <= len(result_24hours), \
            "1-hour filter should return <= 24-hour results"

    # Test 4: Verify sorting works with filtering
    if len(result_10min) > 1:
        # Get non-null dates
        dates = []
        for idx in range(len(result_10min)):
            date_str = result_10min.iloc[idx]["last_update_date"]
            if pd.notna(date_str):
                try:
                    # Handle timezone-aware datetime strings
                    clean_date_str = str(date_str)
                    if '+' in clean_date_str or clean_date_str.endswith('Z'):
                        clean_date_str = clean_date_str.replace('+00:00', '').replace('Z', '')
                    dates.append(datetime.strptime(clean_date_str, "%Y-%m-%d %H:%M:%S"))
                except (ValueError, TypeError):
                    pass

        if len(dates) > 1:
            # Check if sorted descending
            for i in range(len(dates) - 1):
                assert dates[i] >= dates[i + 1], \
                    f"Results should be sorted by last_update_date descending: {dates[i]} >= {dates[i+1]}"


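# The timezone handling above (strip "+00:00"/"Z", then strptime) recurs in the
# two optimization tests below as well. A naive-datetime parser sketch,
# assuming only the string formats these tests already expect:
def _parse_naive(date_value):
    """Parse a possibly timezone-suffixed timestamp into a naive datetime, or None."""
    from datetime import datetime

    text = str(date_value).replace("+00:00", "").replace("Z", "")
    try:
        return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        return None

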
def test_updated_since_optimization():
    """Test that updated_since optimization works (auto-sort + early termination)"""
    from datetime import datetime, timedelta
    import time

    # Test 1: Verify auto-sort is applied when using updated_since without explicit sort
    start_time = time.time()
    result = scrape_property(
        location="California",
        updated_since=datetime.now() - timedelta(minutes=5),
        # NO sort_by specified - should auto-apply sort_by="last_update_date"
        limit=50
    )
    elapsed_time = time.time() - start_time

    print(f"\nAuto-sort test: {len(result)} properties in {elapsed_time:.2f}s")

    # Should complete quickly due to early termination optimization (<5 seconds)
    assert elapsed_time < 5.0, f"Query should be fast with optimization, took {elapsed_time:.2f}s"

    # Verify results are sorted by last_update_date (proving auto-sort worked)
    if len(result) > 1:
        dates = []
        for idx in range(min(10, len(result))):
            date_str = result.iloc[idx]["last_update_date"]
            if pd.notna(date_str):
                try:
                    clean_date_str = str(date_str)
                    if '+' in clean_date_str or clean_date_str.endswith('Z'):
                        clean_date_str = clean_date_str.replace('+00:00', '').replace('Z', '')
                    dates.append(datetime.strptime(clean_date_str, "%Y-%m-%d %H:%M:%S"))
                except (ValueError, TypeError):
                    pass

        if len(dates) > 1:
            # Verify descending order (most recent first)
            for i in range(len(dates) - 1):
                assert dates[i] >= dates[i + 1], \
                    "Auto-applied sort should order by last_update_date descending"

    print("Auto-sort optimization verified ✓")


def test_pending_date_optimization():
    """Test that PENDING + date filters get auto-sort and early termination"""
    from datetime import datetime, timedelta
    import time

    # Test: Verify auto-sort is applied for PENDING with past_days
    start_time = time.time()
    result = scrape_property(
        location="California",
        listing_type="pending",
        past_days=7,
        # NO sort_by specified - should auto-apply sort_by="pending_date"
        limit=50
    )
    elapsed_time = time.time() - start_time

    print(f"\nPENDING auto-sort test: {len(result)} properties in {elapsed_time:.2f}s")

    # Should complete quickly due to optimization (<10 seconds)
    assert elapsed_time < 10.0, f"PENDING query should be fast with optimization, took {elapsed_time:.2f}s"

    # Verify results are sorted by pending_date (proving auto-sort worked)
    if len(result) > 1:
        dates = []
        for idx in range(min(10, len(result))):
            date_str = result.iloc[idx]["pending_date"]
            if pd.notna(date_str):
                try:
                    clean_date_str = str(date_str)
                    if '+' in clean_date_str or clean_date_str.endswith('Z'):
                        clean_date_str = clean_date_str.replace('+00:00', '').replace('Z', '')
                    dates.append(datetime.strptime(clean_date_str, "%Y-%m-%d %H:%M:%S"))
                except (ValueError, TypeError):
                    pass

        if len(dates) > 1:
            # Verify descending order (most recent first)
            for i in range(len(dates) - 1):
                assert dates[i] >= dates[i + 1], \
                    "PENDING auto-applied sort should order by pending_date descending"

    print("PENDING optimization verified ✓")


def test_basic_last_update_date():
    """Test updated_since with a naive datetime, compared against UTC values in the result"""
    from datetime import datetime, timedelta

    # Test with naive datetime (treated as local time)
    now = datetime.now()

    properties = scrape_property(
        "California",
        updated_since=now - timedelta(minutes=10),
        sort_by="last_update_date",
        sort_direction="desc"
    )

    # Convert now to timezone-aware for comparison with UTC dates in DataFrame
    now_utc = now.astimezone(tz=pytz.timezone("UTC"))

    # Check all last_update_date values are <= now
    assert (properties["last_update_date"] <= now_utc).all()

    # Verify we got some results
    assert len(properties) > 0


def test_timezone_aware_last_update_date():
    """Test that timezone-aware datetimes work correctly for updated_since"""
    from datetime import datetime, timedelta, timezone

    # Test with timezone-aware datetime (explicit UTC)
    now_utc = datetime.now(timezone.utc)

    properties = scrape_property(
        "California",
        updated_since=now_utc - timedelta(minutes=10),
        sort_by="last_update_date",
        sort_direction="desc"
    )

    # Check all last_update_date values are <= now
    assert (properties["last_update_date"] <= now_utc).all()

    # Verify we got some results
    assert len(properties) > 0


def test_timezone_handling_date_range():
    """Test timezone handling for date_from and date_to parameters"""
    from datetime import datetime, timedelta

    # Test with naive datetimes for date range (PENDING properties)
    now = datetime.now()
    three_days_ago = now - timedelta(days=3)

    properties = scrape_property(
        "California",
        listing_type="pending",
        date_from=three_days_ago,
        date_to=now
    )

    # Verify we got results and they're within the date range
    if len(properties) > 0:
        # Convert now to UTC for comparison
        now_utc = now.astimezone(tz=pytz.timezone("UTC"))
        assert (properties["pending_date"] <= now_utc).all()
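

# For reference, the conversion pattern the two timezone tests above rely on —
# turning a naive local datetime into an aware UTC one before comparing against
# the UTC timestamps in the DataFrame — in isolation (pure pytz/datetime,
# no scraping involved):
#
#     from datetime import datetime
#     naive_local = datetime.now()
#     aware_utc = naive_local.astimezone(tz=pytz.timezone("UTC"))
#     assert aware_utc.tzinfo is not None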