parent
6a3f7df087
commit
0de916e590
10
README.md
10
README.md
|
@ -38,9 +38,9 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
|
||||||
properties = scrape_property(
|
properties = scrape_property(
|
||||||
location="San Diego, CA",
|
location="San Diego, CA",
|
||||||
listing_type="sold", # or (for_sale, for_rent, pending)
|
listing_type="sold", # or (for_sale, for_rent, pending)
|
||||||
property_type=['single_family','multi_family'],
|
|
||||||
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
|
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
|
||||||
|
|
||||||
|
# property_type=['single_family','multi_family'],
|
||||||
# date_from="2023-05-01", # alternative to past_days
|
# date_from="2023-05-01", # alternative to past_days
|
||||||
# date_to="2023-05-28",
|
# date_to="2023-05-28",
|
||||||
# foreclosure=True
|
# foreclosure=True
|
||||||
|
@ -154,6 +154,14 @@ Property
|
||||||
│ ├── new_construction
|
│ ├── new_construction
|
||||||
│ └── hoa_fee
|
│ └── hoa_fee
|
||||||
|
|
||||||
|
├── Tax Information:
|
||||||
|
│ ├── year
|
||||||
|
│ ├── tax
|
||||||
|
│ ├── assessment
|
||||||
|
│ │ ├── building
|
||||||
|
│ │ ├── land
|
||||||
|
│ │ └── total
|
||||||
|
|
||||||
├── Location Details:
|
├── Location Details:
|
||||||
│ ├── latitude
|
│ ├── latitude
|
||||||
│ ├── longitude
|
│ ├── longitude
|
||||||
|
|
|
@ -176,5 +176,7 @@ class Property:
|
||||||
nearby_schools: list[str] = None
|
nearby_schools: list[str] = None
|
||||||
assessed_value: int | None = None
|
assessed_value: int | None = None
|
||||||
estimated_value: int | None = None
|
estimated_value: int | None = None
|
||||||
|
tax: int | None = None
|
||||||
|
tax_history: list[dict] | None = None
|
||||||
|
|
||||||
advertisers: Advertisers | None = None
|
advertisers: Advertisers | None = None
|
||||||
|
|
|
@ -13,7 +13,12 @@ from datetime import datetime
|
||||||
from json import JSONDecodeError
|
from json import JSONDecodeError
|
||||||
from typing import Dict, Union, Optional
|
from typing import Dict, Union, Optional
|
||||||
|
|
||||||
from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
|
from tenacity import (
|
||||||
|
retry,
|
||||||
|
retry_if_exception_type,
|
||||||
|
wait_exponential,
|
||||||
|
stop_after_attempt,
|
||||||
|
)
|
||||||
|
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ..models import (
|
from ..models import (
|
||||||
|
@ -202,27 +207,29 @@ class RealtorScraper(Scraper):
|
||||||
property_url=result["href"],
|
property_url=result["href"],
|
||||||
property_id=property_id,
|
property_id=property_id,
|
||||||
listing_id=result.get("listing_id"),
|
listing_id=result.get("listing_id"),
|
||||||
status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(),
|
status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()),
|
||||||
list_price=result["list_price"],
|
list_price=result["list_price"],
|
||||||
list_price_min=result["list_price_min"],
|
list_price_min=result["list_price_min"],
|
||||||
list_price_max=result["list_price_max"],
|
list_price_max=result["list_price_max"],
|
||||||
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
|
list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None),
|
||||||
prc_sqft=result.get("price_per_sqft"),
|
prc_sqft=result.get("price_per_sqft"),
|
||||||
last_sold_date=result.get("last_sold_date"),
|
last_sold_date=result.get("last_sold_date"),
|
||||||
new_construction=result["flags"].get("is_new_construction") is True,
|
new_construction=result["flags"].get("is_new_construction") is True,
|
||||||
hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
|
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
|
||||||
latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
|
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),
|
||||||
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None),
|
||||||
address=self._parse_address(result, search_type="general_search"),
|
address=self._parse_address(result, search_type="general_search"),
|
||||||
description=self._parse_description(result),
|
description=self._parse_description(result),
|
||||||
neighborhoods=self._parse_neighborhoods(result),
|
neighborhoods=self._parse_neighborhoods(result),
|
||||||
county=result["location"]["county"].get("name") if result["location"]["county"] else None,
|
county=(result["location"]["county"].get("name") if result["location"]["county"] else None),
|
||||||
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
|
fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None),
|
||||||
days_on_mls=self.calculate_days_on_mls(result),
|
days_on_mls=self.calculate_days_on_mls(result),
|
||||||
nearby_schools=prop_details.get("schools"),
|
nearby_schools=prop_details.get("schools"),
|
||||||
assessed_value=prop_details.get("assessed_value"),
|
assessed_value=prop_details.get("assessed_value"),
|
||||||
estimated_value=estimated_value if estimated_value else None,
|
estimated_value=estimated_value if estimated_value else None,
|
||||||
advertisers=advertisers,
|
advertisers=advertisers,
|
||||||
|
tax=prop_details.get("tax"),
|
||||||
|
tax_history=prop_details.get("tax_history"),
|
||||||
)
|
)
|
||||||
return realty_property
|
return realty_property
|
||||||
|
|
||||||
|
@ -447,7 +454,11 @@ class RealtorScraper(Scraper):
|
||||||
variables=search_variables | {"offset": i},
|
variables=search_variables | {"offset": i},
|
||||||
search_type=search_type,
|
search_type=search_type,
|
||||||
)
|
)
|
||||||
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
|
for i in range(
|
||||||
|
self.DEFAULT_PAGE_SIZE,
|
||||||
|
min(total, self.limit),
|
||||||
|
self.DEFAULT_PAGE_SIZE,
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
|
@ -469,15 +480,45 @@ class RealtorScraper(Scraper):
|
||||||
def process_extra_property_details(self, result: dict) -> dict:
    """Extract school, assessment and tax details from a property-detail payload.

    Args:
        result: Raw detail dictionary. It is read through ``self.get_key`` at
            the paths ``nearbySchools -> schools`` and ``taxHistory``.

    Returns:
        A dict with four keys:

        * ``schools``: list of school district names, or None when empty.
        * ``assessed_value``: ``taxHistory[0].assessment.total`` in payload
          order (before sorting), or None when falsy.
        * ``tax``: the ``tax`` value of the entry with the highest ``year``,
          or None when there is no usable tax history.
        * ``tax_history``: list of ``{"year", "tax"[, "assessment"]}`` dicts
          sorted newest first, or None when no tax-history list is present.
    """
    schools = self.get_key(result, ["nearbySchools", "schools"])
    # Taken from the first entry as delivered, i.e. before the sort below.
    assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"])
    tax_history = self.get_key(result, ["taxHistory"])

    # A school whose district is missing or null is skipped instead of raising.
    district_names = []
    for school in schools or []:
        district = school.get("district") if isinstance(school, dict) else None
        if isinstance(district, dict) and district.get("name"):
            district_names.append(district["name"])

    latest_tax = None
    processed_tax_history = None
    if tax_history and isinstance(tax_history, list):
        # Ignore non-dict entries, and treat a missing or null year as 0 so the
        # sort never compares None with an int (which raises TypeError).
        entries = sorted(
            (entry for entry in tax_history if isinstance(entry, dict)),
            key=lambda entry: entry.get("year") or 0,
            reverse=True,
        )

        # After the descending sort the first entry is the most recent year.
        if entries and "tax" in entries[0]:
            latest_tax = entries[0]["tax"]

        processed_tax_history = []
        for entry in entries:
            # Only entries carrying both keys are reported.
            if "year" not in entry or "tax" not in entry:
                continue
            processed_entry = {
                "year": entry["year"],
                "tax": entry["tax"],
            }
            assessment = entry.get("assessment")
            if isinstance(assessment, dict):
                processed_entry["assessment"] = {
                    "building": assessment.get("building"),
                    "land": assessment.get("land"),
                    "total": assessment.get("total"),
                }
            processed_tax_history.append(processed_entry)

    return {
        "schools": district_names if district_names else None,
        "assessed_value": assessed_value if assessed_value else None,
        "tax": latest_tax,
        "tax_history": processed_tax_history,
    }
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3)
|
retry=retry_if_exception_type(JSONDecodeError),
|
||||||
|
wait=wait_exponential(min=4, max=10),
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
)
|
)
|
||||||
def get_prop_details(self, property_id: str) -> dict:
|
def get_prop_details(self, property_id: str) -> dict:
|
||||||
if not self.extra_property_data:
|
if not self.extra_property_data:
|
||||||
|
@ -570,7 +611,7 @@ class RealtorScraper(Scraper):
|
||||||
return Description(
|
return Description(
|
||||||
primary_photo=primary_photo,
|
primary_photo=primary_photo,
|
||||||
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
|
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
|
||||||
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
|
style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None),
|
||||||
beds=description_data.get("beds"),
|
beds=description_data.get("beds"),
|
||||||
baths_full=description_data.get("baths_full"),
|
baths_full=description_data.get("baths_full"),
|
||||||
baths_half=description_data.get("baths_half"),
|
baths_half=description_data.get("baths_half"),
|
||||||
|
|
|
@ -33,6 +33,8 @@ ordered_properties = [
|
||||||
"last_sold_date",
|
"last_sold_date",
|
||||||
"assessed_value",
|
"assessed_value",
|
||||||
"estimated_value",
|
"estimated_value",
|
||||||
|
"tax",
|
||||||
|
"tax_history",
|
||||||
"new_construction",
|
"new_construction",
|
||||||
"lot_sqft",
|
"lot_sqft",
|
||||||
"price_per_sqft",
|
"price_per_sqft",
|
||||||
|
@ -115,8 +117,11 @@ def process_result(result: Property) -> pd.DataFrame:
|
||||||
if description:
|
if description:
|
||||||
prop_data["primary_photo"] = description.primary_photo
|
prop_data["primary_photo"] = description.primary_photo
|
||||||
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
|
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
|
||||||
prop_data["style"] = description.style if isinstance(description.style,
|
prop_data["style"] = (
|
||||||
str) else description.style.value if description.style else None
|
description.style
|
||||||
|
if isinstance(description.style, str)
|
||||||
|
else description.style.value if description.style else None
|
||||||
|
)
|
||||||
prop_data["beds"] = description.beds
|
prop_data["beds"] = description.beds
|
||||||
prop_data["full_baths"] = description.baths_full
|
prop_data["full_baths"] = description.baths_full
|
||||||
prop_data["half_baths"] = description.baths_half
|
prop_data["half_baths"] = description.baths_half
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.4.4"
|
version = "0.4.5"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/Bunsly/HomeHarvest"
|
||||||
|
|
Loading…
Reference in New Issue