2024-05-14 19:13:04 -07:00
|
|
|
from __future__ import annotations
|
2023-10-03 22:21:16 -07:00
|
|
|
import pandas as pd
|
2023-11-03 16:35:41 -07:00
|
|
|
from datetime import datetime
|
2024-08-20 05:19:15 -07:00
|
|
|
from .core.scrapers.models import Property, ListingType, Advertisers
|
2023-11-03 16:35:41 -07:00
|
|
|
from .exceptions import InvalidListingType, InvalidDate
|
2023-10-03 22:21:16 -07:00
|
|
|
|
|
|
|
# Column order of the DataFrame produced by ``process_result``.
# The list is consumed positionally, so the sequence below is significant.
ordered_properties = [
    # --- listing identity ---
    "property_url",
    "property_id",
    "listing_id",
    "mls",
    "mls_id",
    "status",
    "text",
    "style",
    # --- address ---
    "full_street_line",
    "street",
    "unit",
    "city",
    "state",
    "zip_code",
    # --- physical description ---
    "beds",
    "full_baths",
    "half_baths",
    "sqft",
    "year_built",
    # --- listing, pricing and valuation ---
    "days_on_mls",
    "list_price",
    "list_price_min",
    "list_price_max",
    "list_date",
    "sold_price",
    "last_sold_date",
    "assessed_value",
    "estimated_value",
    "tax",
    "tax_history",
    "new_construction",
    "lot_sqft",
    "price_per_sqft",
    # --- location ---
    "latitude",
    "longitude",
    "neighborhoods",
    "county",
    "fips_code",
    # --- building extras ---
    "stories",
    "hoa_fee",
    "parking_garage",
    # --- advertisers: agent ---
    "agent_id",
    "agent_name",
    "agent_email",
    "agent_phones",
    "agent_mls_set",
    "agent_nrds_id",
    # --- advertisers: broker ---
    "broker_id",
    "broker_name",
    # --- advertisers: builder ---
    "builder_id",
    "builder_name",
    # --- advertisers: office ---
    "office_id",
    "office_mls_set",
    "office_name",
    "office_email",
    "office_phones",
    # --- schools and photos ---
    "nearby_schools",
    "primary_photo",
    "alt_photos",
]
|
|
|
|
|
|
|
|
|
|
|
|
def process_result(result: Property) -> pd.DataFrame:
    """Flatten one scraped property into a single-row DataFrame.

    The attributes of ``result`` are copied into a dict that is pre-seeded
    with every name in ``ordered_properties``; nested ``address``,
    ``advertisers`` and ``description`` objects are then expanded into the
    flat columns.

    Args:
        result: The scraped property to flatten.

    Returns:
        A one-row DataFrame whose columns are exactly ``ordered_properties``,
        in that order. Columns with no available value hold ``None``.
    """
    # Seed every output column so a missing attribute still yields a column.
    prop_data = {prop: None for prop in ordered_properties}
    prop_data.update(result.__dict__)

    # The key can be present with a None value, so test the value itself
    # rather than key membership before dereferencing it.
    address_data = prop_data.get("address")
    if address_data is not None:
        prop_data["full_street_line"] = address_data.full_line
        prop_data["street"] = address_data.street
        prop_data["unit"] = address_data.unit
        prop_data["city"] = address_data.city
        prop_data["state"] = address_data.state
        prop_data["zip_code"] = address_data.zip

    advertiser_data: Advertisers | None = prop_data.get("advertisers")
    if advertiser_data:
        agent_data = advertiser_data.agent
        if agent_data:
            prop_data["agent_id"] = agent_data.uuid
            prop_data["agent_name"] = agent_data.name
            prop_data["agent_email"] = agent_data.email
            prop_data["agent_phones"] = agent_data.phones
            prop_data["agent_mls_set"] = agent_data.mls_set
            prop_data["agent_nrds_id"] = agent_data.nrds_id

        broker_data = advertiser_data.broker
        if broker_data:
            prop_data["broker_id"] = broker_data.uuid
            prop_data["broker_name"] = broker_data.name

        builder_data = advertiser_data.builder
        if builder_data:
            prop_data["builder_id"] = builder_data.uuid
            prop_data["builder_name"] = builder_data.name

        office_data = advertiser_data.office
        if office_data:
            prop_data["office_id"] = office_data.uuid
            prop_data["office_name"] = office_data.name
            prop_data["office_email"] = office_data.email
            prop_data["office_phones"] = office_data.phones
            prop_data["office_mls_set"] = office_data.mls_set

    # ``.get`` avoids a KeyError when the source object has no ``prc_sqft``.
    prop_data["price_per_sqft"] = prop_data.get("prc_sqft")

    # Drop empty entries and de-duplicate while keeping first-seen order, so
    # the joined string is stable from run to run (a ``set`` is not).
    schools = prop_data["nearby_schools"]
    unique_schools = list(dict.fromkeys(s for s in schools if s)) if schools else []
    prop_data["nearby_schools"] = ", ".join(unique_schools) if unique_schools else None

    description = result.description
    if description:
        prop_data["primary_photo"] = description.primary_photo
        prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None

        # ``style`` may already be a plain string; otherwise its ``.value``
        # is used when it is set.
        style = description.style
        if isinstance(style, str):
            prop_data["style"] = style
        elif style:
            prop_data["style"] = style.value
        else:
            prop_data["style"] = None

        prop_data["beds"] = description.beds
        prop_data["full_baths"] = description.baths_full
        prop_data["half_baths"] = description.baths_half
        prop_data["sqft"] = description.sqft
        prop_data["lot_sqft"] = description.lot_sqft
        prop_data["sold_price"] = description.sold_price
        prop_data["year_built"] = description.year_built
        prop_data["parking_garage"] = description.garage
        prop_data["stories"] = description.stories
        prop_data["text"] = description.text

    # ``reindex`` both selects and orders the columns, dropping helper keys
    # such as ``address`` or ``description`` that are not output columns.
    return pd.DataFrame([prop_data]).reindex(columns=ordered_properties)
|
|
|
|
|
|
|
|
|
|
|
|
def validate_input(listing_type: str) -> None:
    """Check that ``listing_type`` names a ``ListingType`` member.

    The comparison is done on the upper-cased input against the enum's
    member names.

    Raises:
        InvalidListingType: If the upper-cased value is not a member name.
    """
    known_names = ListingType.__members__
    if listing_type.upper() in known_names:
        return

    raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
|
2023-11-03 16:35:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
def validate_dates(date_from: str | None, date_to: str | None) -> None:
    """Validate an optional ``date_from`` / ``date_to`` pair.

    Args:
        date_from: Range start formatted as ``YYYY-MM-DD``, or ``None``.
        date_to: Range end formatted as ``YYYY-MM-DD``, or ``None``.

    Raises:
        InvalidDate: If exactly one of the two arguments is a string, if
            either string does not parse as ``%Y-%m-%d``, or if ``date_to``
            is earlier than ``date_from``.
    """
    # Exactly one of the two being a string means half a range was supplied.
    if isinstance(date_from, str) != isinstance(date_to, str):
        raise InvalidDate("Both date_from and date_to must be provided.")

    # Nothing further to check unless both values are non-empty.
    if not (date_from and date_to):
        return

    # Only the parsing can raise ValueError, so keep the try body to that.
    try:
        date_from_obj = datetime.strptime(date_from, "%Y-%m-%d")
        date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
    except ValueError as err:
        # Chain the parser's error so the offending value stays in the traceback.
        raise InvalidDate("Invalid date format or range") from err

    # Equal dates are accepted; only a reversed range is rejected.
    if date_to_obj < date_from_obj:
        raise InvalidDate("date_to must be after date_from.")
|
2024-07-15 07:19:57 -07:00
|
|
|
|
|
|
|
|
|
|
|
def validate_limit(limit: int) -> None:
    """Check that ``limit`` lies within the accepted 1..10000 range.

    A ``limit`` of ``None`` is accepted without any check.

    Raises:
        ValueError: If ``limit`` is below 1 or above 10000.
    """
    if limit is None:
        return

    too_small = limit < 1
    too_large = limit > 10000
    if too_small or too_large:
        raise ValueError("Property limit must be between 1 and 10,000.")
|