- add all new data fields

This commit is contained in:
Zachary Hampton
2025-07-15 13:21:48 -07:00
parent 79082090cb
commit 6c6243eba4
6 changed files with 49256 additions and 38 deletions

View File

@@ -1,7 +1,8 @@
from __future__ import annotations
from enum import Enum
from typing import Optional
from pydantic import BaseModel, computed_field
from typing import Optional, Any
from datetime import datetime
from pydantic import BaseModel, computed_field, HttpUrl, Field
class ReturnType(Enum):
@@ -72,9 +73,15 @@ class Address(BaseModel):
full_line: str | None = None
street: str | None = None
unit: str | None = None
city: str | None = None
state: str | None = None
zip: str | None = None
city: str | None = Field(None, description="The name of the city")
state: str | None = Field(None, description="The name of the state")
zip: str | None = Field(None, description="zip code")
# Additional address fields from GraphQL
street_direction: str | None = None
street_number: str | None = None
street_name: str | None = None
street_suffix: str | None = None
@computed_field
@property
@@ -102,19 +109,23 @@ class Address(BaseModel):
class Description(BaseModel):
primary_photo: str | None = None
alt_photos: list[str] | None = None
primary_photo: HttpUrl | None = None
alt_photos: list[HttpUrl] | None = None
style: PropertyType | None = None
beds: int | None = None
baths_full: int | None = None
baths_half: int | None = None
sqft: int | None = None
lot_sqft: int | None = None
sold_price: int | None = None
year_built: int | None = None
garage: float | None = None
stories: int | None = None
beds: int | None = Field(None, description="Total number of bedrooms")
baths_full: int | None = Field(None, description="Total number of full bathrooms (4 parts: Sink, Shower, Bathtub and Toilet)")
baths_half: int | None = Field(None, description="Total number of 1/2 bathrooms (2 parts: Usually Sink and Toilet)")
sqft: int | None = Field(None, description="Square footage of the Home")
lot_sqft: int | None = Field(None, description="Lot square footage")
sold_price: int | None = Field(None, description="Sold price of home")
year_built: int | None = Field(None, description="The year the building/home was built")
garage: float | None = Field(None, description="Number of garage spaces")
stories: int | None = Field(None, description="Number of stories in the building")
text: str | None = None
# Additional description fields
name: str | None = None
type: str | None = None
class AgentPhone(BaseModel):
@@ -125,7 +136,7 @@ class AgentPhone(BaseModel):
class Entity(BaseModel):
name: str
name: str | None = None # Make name optional since it can be None
uuid: str | None = None
@@ -160,29 +171,30 @@ class Advertisers(BaseModel):
class Property(BaseModel):
property_url: str
property_id: str
property_url: HttpUrl
property_id: str = Field(..., description="Unique Home identifier also known as property id")
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None
permalink: str | None = None
mls: str | None = None
mls_id: str | None = None
status: str | None = None
status: str | None = Field(None, description="Listing status: for_sale, for_rent, sold, off_market, active (New Home Subdivisions), other (if none of the above conditions were met)")
address: Address | None = None
list_price: int | None = None
list_price: int | None = Field(None, description="The current price of the Home")
list_price_min: int | None = None
list_price_max: int | None = None
list_date: str | None = None
pending_date: str | None = None
last_sold_date: str | None = None
list_date: datetime | None = Field(None, description="The time this Home entered Move system")
pending_date: datetime | None = Field(None, description="The date listing went into pending state")
last_sold_date: datetime | None = Field(None, description="Last time the Home was sold")
prc_sqft: int | None = None
new_construction: bool | None = None
hoa_fee: int | None = None
days_on_mls: int | None = None
new_construction: bool | None = Field(None, description="Search for new construction homes")
hoa_fee: int | None = Field(None, description="Search for homes where HOA fee is known and falls within specified range")
days_on_mls: int | None = Field(None, description="An integer value determined by the MLS to calculate days on market")
description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
@@ -190,8 +202,8 @@ class Property(BaseModel):
latitude: float | None = None
longitude: float | None = None
neighborhoods: Optional[str] = None
county: Optional[str] = None
fips_code: Optional[str] = None
county: Optional[str] = Field(None, description="County associated with home")
fips_code: Optional[str] = Field(None, description="The FIPS (Federal Information Processing Standard) code for the county")
nearby_schools: list[str] | None = None
assessed_value: int | None = None
estimated_value: int | None = None
@@ -199,3 +211,124 @@ class Property(BaseModel):
tax_history: list[dict] | None = None
advertisers: Advertisers | None = None
# Additional fields from GraphQL that aren't currently parsed
mls_status: str | None = None
last_sold_price: int | None = None
# Structured data from GraphQL
open_houses: list[OpenHouse] | None = None
pet_policy: PetPolicy | None = None
units: list[Unit] | None = None
monthly_fees: HomeMonthlyFee | None = Field(None, description="Monthly fees. Currently only some rental data will have them.")
one_time_fees: list[HomeOneTimeFee] | None = Field(None, description="One time fees. Currently only some rental data will have them.")
parking: HomeParkingDetails | None = Field(None, description="Parking information. Currently only some rental data will have it.")
terms: list[PropertyDetails] | None = None
popularity: Popularity | None = None
tax_record: TaxRecord | None = None
parcel_info: dict | None = None # Keep as dict for flexibility
current_estimates: list[PropertyEstimate] | None = None
estimates: dict | None = None # Keep as dict for flexibility
photos: list[dict] | None = None # Keep as dict for photo structure
flags: HomeFlags | None = Field(None, description="Home flags for Listing/Property")
# Specialized models for GraphQL types
class HomeMonthlyFee(BaseModel):
    # Recurring-fee entry: two optional strings, both defaulting to None.
    # Used as the type of Property.monthly_fees, whose own description notes
    # that only some rental data carries it.
    description: str | None = None
    # Kept as a string, not a number.
    # NOTE(review): presumably a pre-formatted amount - confirm against the API.
    display_amount: str | None = None
class HomeOneTimeFee(BaseModel):
    # One-time-fee entry: two optional strings, both defaulting to None.
    # Property.one_time_fees holds a list of these; its description notes
    # that only some rental data carries them.
    description: str | None = None
    # Kept as a string, not a number.
    # NOTE(review): presumably a pre-formatted amount - confirm against the API.
    display_amount: str | None = None
class HomeParkingDetails(BaseModel):
    # Parking details used as the type of Property.parking. Every field is
    # optional and defaults to None.
    # NOTE(review): units of the *_rent fields are not stated here - confirm.
    unassigned_space_rent: int | None = None
    assigned_spaces_available: int | None = None
    description: str | None = Field(None, description="Parking information. Currently only some rental data will have it.")
    assigned_space_rent: int | None = None
class PetPolicy(BaseModel):
    # Pet allowances used as the type of Property.pet_policy. Each flag is an
    # optional bool that defaults to None when the source does not supply it.
    cats: bool | None = Field(None, description="Search for homes which allow cats")
    dogs: bool | None = Field(None, description="Search for homes which allow dogs")
    # Description wording fixed ("with allow" -> "which allow") so it matches
    # the sibling fields in the generated schema.
    dogs_small: bool | None = Field(None, description="Search for homes which allow small dogs")
    dogs_large: bool | None = Field(None, description="Search for homes which allow large dogs")
class OpenHouse(BaseModel):
    # One open-house entry; Property.open_houses holds a list of these.
    # Every field is optional and defaults to None.
    # The scraper's _parse_open_houses converts the start_date / end_date
    # strings to datetime before the Property is built.
    start_date: datetime | None = None
    end_date: datetime | None = None
    description: str | None = None
    time_zone: str | None = None
    # NOTE(review): presumably a daylight-saving-time flag - confirm.
    dst: bool | None = None
    href: HttpUrl | None = None
    methods: list[str] | None = None
class HomeFlags(BaseModel):
    # Boolean listing flags used as the type of Property.flags, which the
    # scraper fills from result.get("flags"). Each flag is optional and
    # defaults to None when absent.
    is_pending: bool | None = None
    is_contingent: bool | None = None
    is_new_construction: bool | None = None
    is_coming_soon: bool | None = None
    is_new_listing: bool | None = None
    is_price_reduced: bool | None = None
    is_foreclosure: bool | None = None
class PopularityPeriod(BaseModel):
    # Engagement counters for one reporting window; Popularity.periods holds
    # a list of these. Every field is optional and defaults to None.
    clicks_total: int | None = None
    views_total: int | None = None
    # NOTE(review): dwell-time units are not stated here - confirm.
    dwell_time_mean: float | None = None
    dwell_time_median: float | None = None
    leads_total: int | None = None
    shares_total: int | None = None
    saves_total: int | None = None
    # NOTE(review): presumably the window length in days - confirm.
    last_n_days: int | None = None
class Popularity(BaseModel):
    # Wrapper used as the type of Property.popularity; carries an optional
    # list of PopularityPeriod entries (None when absent).
    periods: list[PopularityPeriod] | None = None
class TaxRecord(BaseModel):
    # Tax-record identifiers used as the type of Property.tax_record.
    # Every field is optional and defaults to None.
    cl_id: str | None = None
    public_record_id: str | None = None
    # The scraper's _parse_tax_record converts this from a string to a
    # datetime before the Property is built.
    last_update_date: datetime | None = None
    # NOTE(review): presumably the assessor's parcel number - confirm.
    apn: str | None = None
    tax_parcel_id: str | None = None
class PropertyEstimate(BaseModel):
    # One value estimate; Property.current_estimates holds a list of these.
    # Every field is optional and defaults to None.
    estimate: int | None = None
    estimate_high: int | None = None
    estimate_low: int | None = None
    # The scraper's _parse_current_estimates converts this from a string to a
    # datetime before the Property is built.
    date: datetime | None = None
    is_best_home_value: bool | None = None
class PropertyDetails(BaseModel):
    # A categorised group of text lines; Property.terms holds a list of these.
    # Every field is optional and defaults to None.
    category: str | None = None
    text: list[str] | None = None
    parent_category: str | None = None
class UnitDescription(BaseModel):
    # Bed/bath/size summary of a single unit, used as the type of
    # Unit.description. Every field is optional and defaults to None.
    baths_consolidated: str | None = None
    baths: float | None = None  # Changed to float to handle values like 2.5
    beds: int | None = None
    sqft: int | None = None
class UnitAvailability(BaseModel):
    # Availability of a single unit, used as the type of Unit.availability.
    # The scraper's _parse_units converts the date string to a datetime
    # before the Property is built.
    date: datetime | None = None
class Unit(BaseModel):
    # One unit of a listing; Property.units holds a list of these.
    # Every field is optional and defaults to None.
    availability: UnitAvailability | None = None
    description: UnitDescription | None = None
    photos: list[dict] | None = None  # Keep as dict for photo structure
    list_price: int | None = None

View File

@@ -209,13 +209,15 @@ class RealtorScraper(Scraper):
property_url=result["href"],
property_id=property_id,
listing_id=result.get("listing_id"),
permalink=result.get("permalink"),
status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None),
list_date=(datetime.fromisoformat(result["list_date"].split("T")[0]) if result.get("list_date") else None),
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
last_sold_date=(datetime.fromisoformat(result["last_sold_date"]) if result.get("last_sold_date") else None),
pending_date=(datetime.fromisoformat(result["pending_date"].split("T")[0]) if result.get("pending_date") else None),
new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),
@@ -232,6 +234,26 @@ class RealtorScraper(Scraper):
advertisers=advertisers,
tax=prop_details.get("tax"),
tax_history=prop_details.get("tax_history"),
# Additional fields from GraphQL
mls_status=result.get("mls_status"),
last_sold_price=result.get("last_sold_price"),
tags=result.get("tags"),
details=result.get("details"),
open_houses=self._parse_open_houses(result.get("open_houses")),
pet_policy=result.get("pet_policy"),
units=self._parse_units(result.get("units")),
monthly_fees=result.get("monthly_fees"),
one_time_fees=result.get("one_time_fees"),
parking=result.get("parking"),
terms=result.get("terms"),
popularity=result.get("popularity"),
tax_record=self._parse_tax_record(result.get("tax_record")),
parcel_info=result.get("location", {}).get("parcel"),
current_estimates=self._parse_current_estimates(result.get("current_estimates")),
estimates=result.get("estimates"),
photos=result.get("photos"),
flags=result.get("flags"),
)
return realty_property
@@ -395,8 +417,9 @@ class RealtorScraper(Scraper):
#: address is retrieved on both homes and search homes, so when merged, homes overrides,
# this gets the internal data we want and only updates that (migrate to a func if more fields)
result["location"].update(specific_details_for_property["location"])
del specific_details_for_property["location"]
if "location" in specific_details_for_property:
result["location"].update(specific_details_for_property["location"])
del specific_details_for_property["location"]
result.update(specific_details_for_property)
@@ -614,6 +637,12 @@ class RealtorScraper(Scraper):
city=address["city"],
state=address["state_code"],
zip=address["postal_code"],
# Additional address fields
street_direction=address.get("street_direction"),
street_number=address.get("street_number"),
street_name=address.get("street_name"),
street_suffix=address.get("street_suffix"),
)
@staticmethod
@@ -630,7 +659,7 @@ class RealtorScraper(Scraper):
if style is not None:
style = style.upper()
primary_photo = ""
primary_photo = None
if (primary_photo_info := result.get("primary_photo")) and (
primary_photo_href := primary_photo_info.get("href")
):
@@ -654,6 +683,10 @@ class RealtorScraper(Scraper):
garage=description_data.get("garage"),
stories=description_data.get("stories"),
text=description_data.get("text"),
# Additional description fields
name=description_data.get("name"),
type=description_data.get("type"),
)
@staticmethod
@@ -685,3 +718,89 @@ class RealtorScraper(Scraper):
for photo_info in photos_info
if photo_info.get("href")
]
@staticmethod
def _parse_open_houses(open_houses_data: list[dict] | None) -> list[dict] | None:
"""Parse open houses data and convert date strings to datetime objects"""
if not open_houses_data:
return None
parsed_open_houses = []
for oh in open_houses_data:
parsed_oh = oh.copy()
# Parse start_date and end_date
if parsed_oh.get("start_date"):
try:
parsed_oh["start_date"] = datetime.fromisoformat(parsed_oh["start_date"].replace("Z", "+00:00"))
except (ValueError, AttributeError):
parsed_oh["start_date"] = None
if parsed_oh.get("end_date"):
try:
parsed_oh["end_date"] = datetime.fromisoformat(parsed_oh["end_date"].replace("Z", "+00:00"))
except (ValueError, AttributeError):
parsed_oh["end_date"] = None
parsed_open_houses.append(parsed_oh)
return parsed_open_houses
@staticmethod
def _parse_units(units_data: list[dict] | None) -> list[dict] | None:
"""Parse units data and convert date strings to datetime objects"""
if not units_data:
return None
parsed_units = []
for unit in units_data:
parsed_unit = unit.copy()
# Parse availability date
if parsed_unit.get("availability") and parsed_unit["availability"].get("date"):
try:
parsed_unit["availability"]["date"] = datetime.fromisoformat(parsed_unit["availability"]["date"].replace("Z", "+00:00"))
except (ValueError, AttributeError):
parsed_unit["availability"]["date"] = None
parsed_units.append(parsed_unit)
return parsed_units
@staticmethod
def _parse_tax_record(tax_record_data: dict | None) -> dict | None:
"""Parse tax record data and convert date strings to datetime objects"""
if not tax_record_data:
return None
parsed_tax_record = tax_record_data.copy()
# Parse last_update_date
if parsed_tax_record.get("last_update_date"):
try:
parsed_tax_record["last_update_date"] = datetime.fromisoformat(parsed_tax_record["last_update_date"].replace("Z", "+00:00"))
except (ValueError, AttributeError):
parsed_tax_record["last_update_date"] = None
return parsed_tax_record
@staticmethod
def _parse_current_estimates(estimates_data: list[dict] | None) -> list[dict] | None:
"""Parse current estimates data and convert date strings to datetime objects"""
if not estimates_data:
return None
parsed_estimates = []
for estimate in estimates_data:
parsed_estimate = estimate.copy()
# Parse date
if parsed_estimate.get("date"):
try:
parsed_estimate["date"] = datetime.fromisoformat(parsed_estimate["date"].replace("Z", "+00:00"))
except (ValueError, AttributeError):
parsed_estimate["date"] = None
parsed_estimates.append(parsed_estimate)
return parsed_estimates

File diff suppressed because it is too large Load Diff