Compare commits

...

6 Commits

Author SHA1 Message Date
zachary e378feeefe - bug fixes 2025-04-12 18:34:35 -07:00
zachary 8a5683fe79 - return type parameter
- optimized get extra fields with query clustering
2025-04-12 17:55:52 -07:00
Zachary Hampton 65f799a27d
Update README.md 2025-02-21 13:33:32 -07:00
Cullen Watson 0de916e590 enh:tax history 2025-01-06 05:28:36 -06:00
Cullen Watson 6a3f7df087 chore:yml 2024-11-05 23:55:59 -06:00
Cullen Watson a75bcc2aa0
docs:readme 2024-11-04 10:22:32 -06:00
10 changed files with 267 additions and 86 deletions

1
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1 @@
github: Bunsly

View File

@ -2,10 +2,6 @@
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
## HomeHarvest Features
- **Source**: Fetches properties directly from **Realtor.com**.
@ -38,9 +34,9 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending)
property_type='single_family',
past_days=30, # sold in the last 30 days; for for_sale/for_rent, listed in the last 30 days
# property_type=['single_family','multi_family'],
# date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28",
# foreclosure=True
@ -87,7 +83,12 @@ Optional
- 'farm'
- 'land'
- 'mobile'
├── return_type (option): Choose the return type.
│ - 'pandas' (default)
│ - 'pydantic'
│ - 'raw' (json)
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
@ -154,6 +155,14 @@ Property
│ ├── new_construction
│ └── hoa_fee
├── Tax Information:
│ ├── year
│ ├── tax
│ ├── assessment
│ │ ├── building
│ │ ├── land
│ │ └── total
├── Location Details:
│ ├── latitude
│ ├── longitude

View File

@ -3,12 +3,13 @@ import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
def scrape_property(
location: str,
listing_type: str = "for_sale",
return_type: str = "pandas",
property_type: list[str] | None = None,
radius: float = None,
mls_only: bool = False,
@ -19,12 +20,13 @@ def scrape_property(
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
limit: int = 10000,
) -> pd.DataFrame:
limit: int = 10000
) -> pd.DataFrame | list[dict] | list[Property]:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
:param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
@ -42,7 +44,8 @@ def scrape_property(
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
listing_type=ListingType(listing_type.upper()),
return_type=ReturnType(return_type.lower()),
property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
proxy=proxy,
radius=radius,
@ -59,6 +62,9 @@ def scrape_property(
site = RealtorScraper(scraper_input)
results = site.search()
if scraper_input.return_type != ReturnType.pandas:
return results
properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs:
return pd.DataFrame()

View File

@ -1,11 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json
@ -24,6 +26,7 @@ class ScraperInput:
extra_property_data: bool | None = True
exclude_pending: bool | None = False
limit: int = 10000
return_type: ReturnType = ReturnType.pandas
class Scraper:
@ -81,8 +84,9 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
def search(self) -> list[Property]: ...
def search(self) -> list[Union[Property | dict]]: ...
@staticmethod
def _parse_home(home) -> Property: ...

View File

@ -4,6 +4,12 @@ from enum import Enum
from typing import Optional
class ReturnType(Enum):
    """Output formats a caller can request from ``scrape_property``.

    Members are resolved by value from the lower-cased ``return_type``
    argument, i.e. ``ReturnType(return_type.lower())``.
    """

    # Results are returned as a list of ``Property`` objects.
    pydantic = "pydantic"
    # Default: results are flattened into a single ``pandas.DataFrame``.
    pandas = "pandas"
    # Results are returned as the raw dicts from the API response,
    # without being converted to ``Property``.
    raw = "raw"
class SiteName(Enum):
ZILLOW = "zillow"
REDFIN = "redfin"
@ -19,6 +25,7 @@ class SiteName(Enum):
class SearchPropertyType(Enum):
SINGLE_FAMILY = "single_family"
APARTMENT = "apartment"
CONDOS = "condos"
CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop"
CONDO_TOWNHOME = "condo_townhome"
@ -148,6 +155,9 @@ class Property:
property_url: str
property_id: str
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None
mls: str | None = None
@ -167,6 +177,8 @@ class Property:
hoa_fee: int | None = None
days_on_mls: int | None = None
description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
latitude: float | None = None
longitude: float | None = None
@ -176,5 +188,7 @@ class Property:
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None
tax: int | None = None
tax_history: list[dict] | None = None
advertisers: Advertisers | None = None

View File

@ -13,7 +13,12 @@ from datetime import datetime
from json import JSONDecodeError
from typing import Dict, Union, Optional
from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
from tenacity import (
retry,
retry_if_exception_type,
wait_exponential,
stop_after_attempt,
)
from .. import Scraper
from ..models import (
@ -27,8 +32,9 @@ from ..models import (
Builder,
Advertisers,
Office,
ReturnType
)
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT
class RealtorScraper(Scraper):
@ -115,7 +121,7 @@ class RealtorScraper(Scraper):
property_info = response_json["data"]["home"]
return [self.process_property(property_info, "home")]
return [self.process_property(property_info)]
@staticmethod
def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@ -163,7 +169,7 @@ class RealtorScraper(Scraper):
return processed_advertisers
def process_property(self, result: dict, query_name: str) -> Property | None:
def process_property(self, result: dict) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only:
@ -183,9 +189,7 @@ class RealtorScraper(Scraper):
return
property_id = result["property_id"]
prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {}
if not prop_details:
prop_details = self.process_extra_property_details(result)
prop_details = self.process_extra_property_details(result) if self.extra_property_data else {}
property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
@ -202,31 +206,33 @@ class RealtorScraper(Scraper):
property_url=result["href"],
property_id=property_id,
listing_id=result.get("listing_id"),
status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(),
status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None),
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),
longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None),
address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
county=(result["location"]["county"].get("name") if result["location"]["county"] else None),
fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None),
days_on_mls=self.calculate_days_on_mls(result),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=estimated_value if estimated_value else None,
advertisers=advertisers,
tax=prop_details.get("tax"),
tax_history=prop_details.get("tax_history"),
)
return realty_property
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
"""
Handles a location area & returns a list of properties
"""
@ -251,7 +257,7 @@ class RealtorScraper(Scraper):
sort_param = (
"sort: [{ field: sold_date, direction: desc }]"
if self.listing_type == ListingType.SOLD
else "sort: [{ field: list_date, direction: desc }]"
else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor
)
pending_or_contingent_param = (
@ -299,24 +305,20 @@ class RealtorScraper(Scraper):
)
elif search_type == "area": #: general search, came from a general location
query = """query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
$location: String!,
$offset: Int,
) {
home_search(
query: {
%s
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
search_location: {location: $location}
status: %s
unique: true
%s
%s
%s
}
bucket: { sort: "fractal_v1.1.3_fr" }
%s
limit: 200
offset: $offset
@ -356,7 +358,7 @@ class RealtorScraper(Scraper):
response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Property] = []
properties: list[Union[Property, dict]] = []
if (
response_json is None
@ -374,15 +376,25 @@ class RealtorScraper(Scraper):
#: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[: self.limit - offset]
properties_list: list[dict] = properties_list[: self.limit - offset]
if self.extra_property_data:
property_ids = [data["property_id"] for data in properties_list]
extra_property_details = self.get_bulk_prop_details(property_ids) or {}
for result in properties_list:
result.update(extra_property_details.get(result["property_id"], {}))
if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result, search_key) for result in properties_list]
futures = [executor.submit(self.process_property, result) for result in properties_list]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
else:
properties = properties_list
return {
"total": total_properties,
@ -427,10 +439,7 @@ class RealtorScraper(Scraper):
else: #: general search, location
search_variables |= {
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"location": self.location,
}
if self.foreclosure:
@ -447,7 +456,11 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i},
search_type=search_type,
)
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
for i in range(
self.DEFAULT_PAGE_SIZE,
min(total, self.limit),
self.DEFAULT_PAGE_SIZE,
)
]
for future in as_completed(futures):
@ -469,38 +482,75 @@ class RealtorScraper(Scraper):
def process_extra_property_details(self, result: dict) -> dict:
    """Pull school-district names and tax data out of a home payload.

    :param result: home dict carrying the ``nearbySchools`` and ``taxHistory``
        aliases requested by the GraphQL queries.
    :return: dict with the keys
        ``schools`` - list of district names, or None when there are none;
        ``assessed_value`` - ``assessment.total`` of the first ``taxHistory``
        entry as delivered (before sorting), or None when falsy;
        ``tax`` - the ``tax`` of the entry with the highest ``year``, or None;
        ``tax_history`` - entries sorted newest first and reduced to
        ``year`` / ``tax`` / ``assessment``; None when the payload has no
        tax history list.
    """
    schools = self.get_key(result, ["nearbySchools", "schools"])
    assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"])
    tax_history = self.get_key(result, ["taxHistory"])

    # Tolerate a missing/non-list schools value and schools whose district is
    # null instead of raising while iterating or subscripting.
    district_names = []
    for school in (schools if isinstance(schools, list) else []):
        district = school.get("district") if isinstance(school, dict) else None
        if isinstance(district, dict) and district.get("name"):
            district_names.append(district["name"])

    latest_tax = None
    processed_tax_history = None

    if tax_history and isinstance(tax_history, list):
        # Newest year first. A null or missing year sorts as 0 so that mixed
        # None/int years cannot raise TypeError; non-dict entries are skipped.
        entries = sorted(
            (entry for entry in tax_history if isinstance(entry, dict)),
            key=lambda entry: entry.get("year") or 0,
            reverse=True,
        )

        if entries and "tax" in entries[0]:
            latest_tax = entries[0]["tax"]

        processed_tax_history = []
        for entry in entries:
            # Only entries that carry both a year and a tax amount are kept.
            if "year" not in entry or "tax" not in entry:
                continue

            processed_entry = {
                "year": entry["year"],
                "tax": entry["tax"],
            }

            assessment = entry.get("assessment")
            if isinstance(assessment, dict):
                processed_entry["assessment"] = {
                    "building": assessment.get("building"),
                    "land": assessment.get("land"),
                    "total": assessment.get("total"),
                }

            processed_tax_history.append(processed_entry)

    return {
        "schools": district_names if district_names else None,
        "assessed_value": assessed_value if assessed_value else None,
        "tax": latest_tax,
        "tax_history": processed_tax_history,
    }
@retry(
retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3)
retry=retry_if_exception_type(JSONDecodeError),
wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3),
)
def get_prop_details(self, property_id: str) -> dict:
if not self.extra_property_data:
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
"""
Fetch extra property details for multiple properties in a single GraphQL query.
Returns a map of property_id to its details.
"""
if not self.extra_property_data or not property_ids:
return {}
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
property_ids = list(set(property_ids))
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
}
}"""
# Construct the bulk query
fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}'
for property_id in property_ids
)
query = f"""{HOME_FRAGMENT}
variables = {"property_id": property_id}
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query, "variables": variables})
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json()
property_details = data["data"]["home"]
return self.process_extra_property_details(property_details)
if "data" not in data:
return {}
properties = data["data"]
return {data.replace('home_', ''): properties[data] for data in properties if properties[data]}
@staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]:
@ -570,7 +620,7 @@ class RealtorScraper(Scraper):
return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None),
beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"),

View File

@ -11,6 +11,34 @@ _SEARCH_HOMES_DATA_BASE = """{
list_price_max
list_price_min
price_per_sqft
tags
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
list_price
__typename
}
flags {
is_contingent
is_pending
@ -64,11 +92,14 @@ _SEARCH_HOMES_DATA_BASE = """{
tax_record {
public_record_id
}
primary_photo {
primary_photo(https: true) {
href
}
photos {
photos(https: true) {
href
tags {
label
}
}
advertisers {
email
@ -116,15 +147,63 @@ _SEARCH_HOMES_DATA_BASE = """{
}
rental_management {
name
href
fulfillment_id
}
}
"""
# GraphQL fragment `HomeData` on the `Home` type, holding the per-property
# "extra" fields: property_id, nearby school districts (aliased `nearbySchools`),
# tax history (aliased `taxHistory`), monthly and one-time fees, parking and terms.
# The Realtor scraper imports this constant and prepends it to a bulk query in
# which each aliased `home(property_id: ...)` field spreads `...HomeData`.
# The `nearbySchools` / `taxHistory` aliases are the keys the scraper reads back
# when it post-processes a result, so renaming them here changes what it finds.
HOME_FRAGMENT = """
fragment HomeData on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
}
"""
HOMES_DATA = """%s
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename

View File

@ -33,6 +33,8 @@ ordered_properties = [
"last_sold_date",
"assessed_value",
"estimated_value",
"tax",
"tax_history",
"new_construction",
"lot_sqft",
"price_per_sqft",
@ -115,8 +117,11 @@ def process_result(result: Property) -> pd.DataFrame:
if description:
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if isinstance(description.style,
str) else description.style.value if description.style else None
prop_data["style"] = (
description.style
if isinstance(description.style, str)
else description.style.value if description.style else None
)
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.4.4"
version = "0.4.7"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -1,4 +1,5 @@
from homeharvest import scrape_property
from homeharvest import scrape_property, Property
import pandas as pd
def test_realtor_pending_or_contingent():
@ -287,3 +288,15 @@ def test_phone_number_matching():
#: assert phone numbers are the same
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_return_type():
    """Check that each ``return_type`` yields the matching container type.

    Runs the same rental search three times: once with the default return
    type (expects a ``pandas.DataFrame``), once with ``"pydantic"`` (expects
    ``Property`` items) and once with ``"raw"`` (expects ``dict`` items).
    """
    query = {"location": "Surprise, AZ", "listing_type": "for_rent", "limit": 100}
    results = {
        "pandas": scrape_property(**query),
        "pydantic": scrape_property(**query, return_type="pydantic"),
        "raw": scrape_property(**query, return_type="raw"),
    }

    assert isinstance(results["pandas"], pd.DataFrame)

    for name, expected_type in (("pydantic", Property), ("raw", dict)):
        # Fail with a readable message instead of an IndexError when the
        # search comes back empty.
        assert results[name], f"no results returned for return_type={name!r}"
        assert isinstance(results[name][0], expected_type)