enh:tax history

master v0.4.5
Cullen Watson 2025-01-06 05:28:36 -06:00
parent 6a3f7df087
commit 0de916e590
5 changed files with 71 additions and 15 deletions

View File

@ -38,9 +38,9 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property( properties = scrape_property(
location="San Diego, CA", location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
property_type=['single_family','multi_family'],
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# property_type=['single_family','multi_family'],
# date_from="2023-05-01", # alternative to past_days # date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28", # date_to="2023-05-28",
# foreclosure=True # foreclosure=True
@ -154,6 +154,14 @@ Property
│ ├── new_construction │ ├── new_construction
│ └── hoa_fee │ └── hoa_fee
├── Tax Information:
│ ├── year
│ ├── tax
│ ├── assessment
│ │ ├── building
│ │ ├── land
│ │ └── total
├── Location Details: ├── Location Details:
│ ├── latitude │ ├── latitude
│ ├── longitude │ ├── longitude

View File

@ -176,5 +176,7 @@ class Property:
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None assessed_value: int | None = None
estimated_value: int | None = None estimated_value: int | None = None
tax: int | None = None
tax_history: list[dict] | None = None
advertisers: Advertisers | None = None advertisers: Advertisers | None = None

View File

@ -13,7 +13,12 @@ from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt from tenacity import (
retry,
retry_if_exception_type,
wait_exponential,
stop_after_attempt,
)
from .. import Scraper from .. import Scraper
from ..models import ( from ..models import (
@ -202,27 +207,29 @@ class RealtorScraper(Scraper):
property_url=result["href"], property_url=result["href"],
property_id=property_id, property_id=property_id,
listing_id=result.get("listing_id"), listing_id=result.get("listing_id"),
status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(), status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()),
list_price=result["list_price"], list_price=result["list_price"],
list_price_min=result["list_price_min"], list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"], list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None),
prc_sqft=result.get("price_per_sqft"), prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"), last_sold_date=result.get("last_sold_date"),
new_construction=result["flags"].get("is_new_construction") is True, new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None, hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None, latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None),
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result), description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result), neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name") if result["location"]["county"] else None, county=(result["location"]["county"].get("name") if result["location"]["county"] else None),
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None),
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=estimated_value if estimated_value else None, estimated_value=estimated_value if estimated_value else None,
advertisers=advertisers, advertisers=advertisers,
tax=prop_details.get("tax"),
tax_history=prop_details.get("tax_history"),
) )
return realty_property return realty_property
@ -447,7 +454,11 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) )
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE) for i in range(
self.DEFAULT_PAGE_SIZE,
min(total, self.limit),
self.DEFAULT_PAGE_SIZE,
)
] ]
for future in as_completed(futures): for future in as_completed(futures):
@ -469,15 +480,45 @@ class RealtorScraper(Scraper):
def process_extra_property_details(self, result: dict) -> dict: def process_extra_property_details(self, result: dict) -> dict:
schools = self.get_key(result, ["nearbySchools", "schools"]) schools = self.get_key(result, ["nearbySchools", "schools"])
assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"]) assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"])
tax_history = self.get_key(result, ["taxHistory"])
schools = [school["district"]["name"] for school in schools if school["district"].get("name")] schools = [school["district"]["name"] for school in schools if school["district"].get("name")]
# Process tax history
latest_tax = None
processed_tax_history = None
if tax_history and isinstance(tax_history, list):
tax_history = sorted(tax_history, key=lambda x: x.get("year", 0), reverse=True)
if tax_history and "tax" in tax_history[0]:
latest_tax = tax_history[0]["tax"]
processed_tax_history = []
for entry in tax_history:
if "year" in entry and "tax" in entry:
processed_entry = {
"year": entry["year"],
"tax": entry["tax"],
}
if "assessment" in entry and isinstance(entry["assessment"], dict):
processed_entry["assessment"] = {
"building": entry["assessment"].get("building"),
"land": entry["assessment"].get("land"),
"total": entry["assessment"].get("total"),
}
processed_tax_history.append(processed_entry)
return { return {
"schools": schools if schools else None, "schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None, "assessed_value": assessed_value if assessed_value else None,
"tax": latest_tax,
"tax_history": processed_tax_history,
} }
@retry( @retry(
retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3) retry=retry_if_exception_type(JSONDecodeError),
wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3),
) )
def get_prop_details(self, property_id: str) -> dict: def get_prop_details(self, property_id: str) -> dict:
if not self.extra_property_data: if not self.extra_property_data:
@ -570,7 +611,7 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None, style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None),
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@ -33,6 +33,8 @@ ordered_properties = [
"last_sold_date", "last_sold_date",
"assessed_value", "assessed_value",
"estimated_value", "estimated_value",
"tax",
"tax_history",
"new_construction", "new_construction",
"lot_sqft", "lot_sqft",
"price_per_sqft", "price_per_sqft",
@ -115,8 +117,11 @@ def process_result(result: Property) -> pd.DataFrame:
if description: if description:
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if isinstance(description.style, prop_data["style"] = (
str) else description.style.value if description.style else None description.style
if isinstance(description.style, str)
else description.style.value if description.style else None
)
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.4.4" version = "0.4.5"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"