Fix timezone handling for all date parameters

- Treat naive datetimes as local time and convert to UTC automatically
- Support both naive and timezone-aware datetimes for updated_since, date_from, date_to
- Fix timezone comparison bug that caused incorrect filtering with naive datetimes
- Update documentation with clear timezone handling examples
- Add comprehensive timezone tests for naive and aware datetimes
- Bump version to 0.8.3

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-11-11 17:40:21 -08:00
parent 7065f8a0d4
commit 9b61a89c77
5 changed files with 133 additions and 24 deletions

View File

@@ -72,6 +72,8 @@ def scrape_property(
- date objects: date(2025, 1, 20) (day-level precision) - date objects: date(2025, 1, 20) (day-level precision)
- datetime objects: datetime(2025, 1, 20, 14, 30) (hour-level precision) - datetime objects: datetime(2025, 1, 20, 14, 30) (hour-level precision)
The precision is automatically detected based on the input format. The precision is automatically detected based on the input format.
Timezone handling: Naive datetimes are treated as local time and automatically converted to UTC.
Timezone-aware datetimes are converted to UTC. For best results, use timezone-aware datetimes.
:param foreclosure: If set, fetches only foreclosure listings. :param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
@@ -80,7 +82,11 @@ def scrape_property(
New parameters: New parameters:
:param past_hours: Get properties in the last _ hours (requires client-side filtering). Accepts int or timedelta. :param past_hours: Get properties in the last _ hours (requires client-side filtering). Accepts int or timedelta.
:param updated_since: Filter by last_update_date (when property was last updated). Accepts datetime object or ISO 8601 string (client-side filtering) :param updated_since: Filter by last_update_date (when property was last updated). Accepts datetime object or ISO 8601 string (client-side filtering).
Timezone handling: Naive datetimes (like datetime.now()) are treated as local time and automatically converted to UTC.
Timezone-aware datetimes are converted to UTC. Examples:
- datetime.now() - uses your local timezone
- datetime.now(timezone.utc) - uses UTC explicitly
:param updated_in_past_hours: Filter by properties updated in the last _ hours. Accepts int or timedelta (client-side filtering) :param updated_in_past_hours: Filter by properties updated in the last _ hours. Accepts int or timedelta (client-side filtering)
:param beds_min, beds_max: Filter by number of bedrooms :param beds_min, beds_max: Filter by number of bedrooms
:param baths_min, baths_max: Filter by number of bathrooms :param baths_min, baths_max: Filter by number of bathrooms

View File

@@ -755,13 +755,14 @@ class RealtorScraper(Scraper):
if not homes: if not homes:
return homes return homes
from datetime import datetime, timedelta from datetime import datetime, timedelta, timezone
# Determine date range for last_update_date filtering # Determine date range for last_update_date filtering
date_range = None date_range = None
if self.updated_in_past_hours: if self.updated_in_past_hours:
cutoff_datetime = datetime.now() - timedelta(hours=self.updated_in_past_hours) # Use UTC now, strip timezone to match naive property dates
cutoff_datetime = (datetime.now(timezone.utc) - timedelta(hours=self.updated_in_past_hours)).replace(tzinfo=None)
date_range = {'type': 'since', 'date': cutoff_datetime} date_range = {'type': 'since', 'date': cutoff_datetime}
elif self.updated_since: elif self.updated_since:
try: try:
@@ -792,15 +793,19 @@ class RealtorScraper(Scraper):
def _get_date_range(self): def _get_date_range(self):
"""Get the date range for filtering based on instance parameters.""" """Get the date range for filtering based on instance parameters."""
from datetime import datetime, timedelta from datetime import datetime, timedelta, timezone
if self.last_x_days: if self.last_x_days:
cutoff_date = datetime.now() - timedelta(days=self.last_x_days) # Use UTC now, strip timezone to match naive property dates
cutoff_date = (datetime.now(timezone.utc) - timedelta(days=self.last_x_days)).replace(tzinfo=None)
return {'type': 'since', 'date': cutoff_date} return {'type': 'since', 'date': cutoff_date}
elif self.date_from and self.date_to: elif self.date_from and self.date_to:
try: try:
from_date = datetime.fromisoformat(self.date_from) # Parse and strip timezone to match naive property dates
to_date = datetime.fromisoformat(self.date_to) from_date_str = self.date_from.replace('Z', '+00:00') if self.date_from.endswith('Z') else self.date_from
to_date_str = self.date_to.replace('Z', '+00:00') if self.date_to.endswith('Z') else self.date_to
from_date = datetime.fromisoformat(from_date_str).replace(tzinfo=None)
to_date = datetime.fromisoformat(to_date_str).replace(tzinfo=None)
return {'type': 'range', 'from_date': from_date, 'to_date': to_date} return {'type': 'range', 'from_date': from_date, 'to_date': to_date}
except ValueError: except ValueError:
return None return None
@@ -865,7 +870,7 @@ class RealtorScraper(Scraper):
Returns: Returns:
bool: True if we should continue pagination, False to stop early bool: True if we should continue pagination, False to stop early
""" """
from datetime import datetime, timedelta from datetime import datetime, timedelta, timezone
# Check for last_update_date filters # Check for last_update_date filters
if (self.updated_since or self.updated_in_past_hours) and self.sort_by == "last_update_date": if (self.updated_since or self.updated_in_past_hours) and self.sort_by == "last_update_date":
@@ -882,11 +887,14 @@ class RealtorScraper(Scraper):
if self.updated_since: if self.updated_since:
try: try:
cutoff_datetime = datetime.fromisoformat(self.updated_since.replace('Z', '+00:00') if self.updated_since.endswith('Z') else self.updated_since) cutoff_datetime = datetime.fromisoformat(self.updated_since.replace('Z', '+00:00') if self.updated_since.endswith('Z') else self.updated_since)
# Strip timezone to match naive datetimes from _parse_date_value
cutoff_datetime = cutoff_datetime.replace(tzinfo=None)
date_range = {'type': 'since', 'date': cutoff_datetime} date_range = {'type': 'since', 'date': cutoff_datetime}
except ValueError: except ValueError:
return True return True
elif self.updated_in_past_hours: elif self.updated_in_past_hours:
cutoff_datetime = datetime.now() - timedelta(hours=self.updated_in_past_hours) # Use UTC now, strip timezone to match naive property dates
cutoff_datetime = (datetime.now(timezone.utc) - timedelta(hours=self.updated_in_past_hours)).replace(tzinfo=None)
date_range = {'type': 'since', 'date': cutoff_datetime} date_range = {'type': 'since', 'date': cutoff_datetime}
else: else:
return True return True
@@ -935,6 +943,8 @@ class RealtorScraper(Scraper):
def get_sort_key(home): def get_sort_key(home):
"""Extract the sort field value from a home (handles both dict and Property object).""" """Extract the sort field value from a home (handles both dict and Property object)."""
from datetime import datetime
if isinstance(home, dict): if isinstance(home, dict):
value = home.get(self.sort_by) value = home.get(self.sort_by)
else: else:
@@ -950,20 +960,23 @@ class RealtorScraper(Scraper):
if self.sort_by in ['list_date', 'sold_date', 'pending_date', 'last_update_date']: if self.sort_by in ['list_date', 'sold_date', 'pending_date', 'last_update_date']:
if isinstance(value, str): if isinstance(value, str):
try: try:
from datetime import datetime
# Handle timezone indicators # Handle timezone indicators
date_value = value date_value = value
if date_value.endswith('Z'): if date_value.endswith('Z'):
date_value = date_value[:-1] + '+00:00' date_value = date_value[:-1] + '+00:00'
parsed_date = datetime.fromisoformat(date_value) parsed_date = datetime.fromisoformat(date_value)
return (0, parsed_date) # Normalize to timezone-naive for consistent comparison
return 0, parsed_date.replace(tzinfo=None)
except (ValueError, AttributeError): except (ValueError, AttributeError):
# If parsing fails, treat as None # If parsing fails, treat as None
return (1, 0) if self.sort_direction == "desc" else (1, float('inf')) return (1, 0) if self.sort_direction == "desc" else (1, float('inf'))
return (0, value) # Handle datetime objects directly (normalize timezone)
if isinstance(value, datetime):
return 0, value.replace(tzinfo=None)
return 0, value
# For numeric fields, ensure we can compare # For numeric fields, ensure we can compare
return (0, value) return 0, value
# Sort the homes # Sort the homes
reverse = (self.sort_direction == "desc") reverse = (self.sort_direction == "desc")

View File

@@ -331,15 +331,26 @@ def validate_sort(sort_by: str | None, sort_direction: str | None = "desc") -> N
def convert_to_datetime_string(value) -> str | None: def convert_to_datetime_string(value) -> str | None:
""" """
Convert datetime object or string to ISO 8601 string format. Convert datetime object or string to ISO 8601 string format with UTC timezone.
Accepts: Accepts:
- datetime.datetime objects - datetime.datetime objects (naive or timezone-aware)
- datetime.date objects - Naive datetimes are treated as local time and converted to UTC
- Timezone-aware datetimes are converted to UTC
- datetime.date objects (treated as midnight UTC)
- ISO 8601 strings (returned as-is) - ISO 8601 strings (returned as-is)
- None (returns None) - None (returns None)
Returns ISO 8601 formatted string or None. Returns ISO 8601 formatted string with UTC timezone or None.
Examples:
>>> # Naive datetime (treated as local time)
>>> convert_to_datetime_string(datetime(2025, 1, 20, 14, 30))
'2025-01-20T22:30:00+00:00' # Assuming PST (UTC-8)
>>> # Timezone-aware datetime
>>> convert_to_datetime_string(datetime(2025, 1, 20, 14, 30, tzinfo=timezone.utc))
'2025-01-20T14:30:00+00:00'
""" """
if value is None: if value is None:
return None return None
@@ -349,13 +360,23 @@ def convert_to_datetime_string(value) -> str | None:
return value return value
# datetime.datetime object # datetime.datetime object
from datetime import datetime, date from datetime import datetime, date, timezone
if isinstance(value, datetime): if isinstance(value, datetime):
return value.isoformat() # Handle naive datetime - treat as local time and convert to UTC
if value.tzinfo is None:
# Convert naive datetime to aware local time, then to UTC
local_aware = value.astimezone()
utc_aware = local_aware.astimezone(timezone.utc)
return utc_aware.isoformat()
else:
# Already timezone-aware, convert to UTC
utc_aware = value.astimezone(timezone.utc)
return utc_aware.isoformat()
# datetime.date object (convert to datetime at midnight) # datetime.date object (convert to datetime at midnight UTC)
if isinstance(value, date): if isinstance(value, date):
return datetime.combine(value, datetime.min.time()).isoformat() utc_datetime = datetime.combine(value, datetime.min.time()).replace(tzinfo=timezone.utc)
return utc_datetime.isoformat()
raise ValueError( raise ValueError(
f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. " f"Invalid datetime value. Expected datetime object, date object, or ISO 8601 string. "

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.2" version = "0.8.3"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -1,3 +1,5 @@
import pytz
from homeharvest import scrape_property, Property from homeharvest import scrape_property, Property
import pandas as pd import pandas as pd
@@ -1524,4 +1526,71 @@ def test_pending_date_optimization():
assert dates[i] >= dates[i + 1], \ assert dates[i] >= dates[i + 1], \
"PENDING auto-applied sort should order by pending_date descending" "PENDING auto-applied sort should order by pending_date descending"
print("PENDING optimization verified ✓") print("PENDING optimization verified ✓")
def test_basic_last_update_date():
    """Check ``updated_since`` filtering when given a naive datetime.

    Requests California properties updated in the last ten minutes, sorted by
    ``last_update_date`` descending, then verifies that results were returned
    and that no ``last_update_date`` lies after the moment the test started.
    """
    from datetime import datetime, timedelta, timezone

    # Naive datetime: per the scrape_property docs it is treated as local time.
    now = datetime.now()

    properties = scrape_property(
        "California",
        updated_since=now - timedelta(minutes=10),
        sort_by="last_update_date",
        sort_direction="desc",
    )

    # Verify we got some results first: ``.all()`` on an empty column is
    # vacuously True, so the bound check below would prove nothing otherwise.
    assert len(properties) > 0

    # astimezone() on a naive datetime assumes local time. The stdlib UTC
    # tzinfo is sufficient here, matching the timezone-aware sibling test.
    now_utc = now.astimezone(timezone.utc)

    # Check all last_update_date values are <= now
    assert (properties["last_update_date"] <= now_utc).all()
def test_timezone_aware_last_update_date():
    """Check ``updated_since`` filtering when given a timezone-aware datetime."""
    from datetime import datetime, timedelta, timezone

    # Explicit UTC reference instant; the cutoff sits ten minutes before it.
    reference_utc = datetime.now(timezone.utc)
    ten_minutes_back = reference_utc - timedelta(minutes=10)

    query = {
        "updated_since": ten_minutes_back,
        "sort_by": "last_update_date",
        "sort_direction": "desc",
    }
    results = scrape_property("California", **query)

    # No returned property may report an update after the reference instant.
    not_in_future = results["last_update_date"] <= reference_utc
    assert not_in_future.all()

    # The query is expected to return at least one property.
    assert len(results) > 0
def test_timezone_handling_date_range():
    """Test timezone handling for date_from and date_to parameters.

    Queries pending California listings over the last three days using naive
    datetimes and, when anything is returned, checks that no ``pending_date``
    lies after the moment the test started.
    """
    from datetime import datetime, timedelta, timezone

    # Test with naive datetimes for date range (PENDING properties)
    now = datetime.now()
    three_days_ago = now - timedelta(days=3)

    properties = scrape_property(
        "California",
        listing_type="pending",
        date_from=three_days_ago,
        date_to=now,
    )

    # An empty result is tolerated. Only the upper bound of the range is
    # asserted here; the lower bound (date_from) is not checked.
    if len(properties) > 0:
        # astimezone() on a naive datetime assumes local time. The stdlib UTC
        # tzinfo is sufficient, so no third-party timezone object is needed.
        now_utc = now.astimezone(timezone.utc)
        assert (properties["pending_date"] <= now_utc).all()