Compare commits

..

3 Commits

Author SHA1 Message Date
zachary e378feeefe - bug fixes 2025-04-12 18:34:35 -07:00
zachary 8a5683fe79 - return type parameter
- optimized get extra fields with query clustering
2025-04-12 17:55:52 -07:00
Zachary Hampton 65f799a27d
Update README.md 2025-02-21 13:33:32 -07:00
8 changed files with 196 additions and 72 deletions

View File

@ -2,10 +2,6 @@
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings. **HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
## HomeHarvest Features ## HomeHarvest Features
- **Source**: Fetches properties directly from **Realtor.com**. - **Source**: Fetches properties directly from **Realtor.com**.
@ -87,7 +83,12 @@ Optional
- 'farm' - 'farm'
- 'land' - 'land'
- 'mobile' - 'mobile'
├── return_type (option): Format of the returned results.
│ - 'pandas' (default): pandas DataFrame
│ - 'pydantic': list of Property objects
│ - 'raw': list of dicts (JSON)
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses. ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored) │ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)

View File

@ -3,12 +3,13 @@ import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
def scrape_property( def scrape_property(
location: str, location: str,
listing_type: str = "for_sale", listing_type: str = "for_sale",
return_type: str = "pandas",
property_type: list[str] | None = None, property_type: list[str] | None = None,
radius: float = None, radius: float = None,
mls_only: bool = False, mls_only: bool = False,
@ -19,12 +20,13 @@ def scrape_property(
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000, limit: int = 10000
) -> pd.DataFrame: ) -> pd.DataFrame | list[dict] | list[Property]:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending) :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
:param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile) :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
@ -42,7 +44,8 @@ def scrape_property(
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType[listing_type.upper()], listing_type=ListingType(listing_type.upper()),
return_type=ReturnType(return_type.lower()),
property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None, property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
proxy=proxy, proxy=proxy,
radius=radius, radius=radius,
@ -59,6 +62,9 @@ def scrape_property(
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
if scraper_input.return_type != ReturnType.pandas:
return results
properties_dfs = [df for result in results if not (df := process_result(result)).empty] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()

View File

@ -1,11 +1,13 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import Union
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json import json
@ -24,6 +26,7 @@ class ScraperInput:
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000 limit: int = 10000
return_type: ReturnType = ReturnType.pandas
class Scraper: class Scraper:
@ -81,8 +84,9 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
def search(self) -> list[Property]: ... def search(self) -> list[Union[Property | dict]]: ...
@staticmethod @staticmethod
def _parse_home(home) -> Property: ... def _parse_home(home) -> Property: ...

View File

@ -4,6 +4,12 @@ from enum import Enum
from typing import Optional from typing import Optional
class ReturnType(Enum):
    """Output formats a caller can request for scraped results.

    Each member's value is the lowercase string accepted as ``return_type``,
    so ``ReturnType("raw")`` resolves a user-supplied option by value.
    """

    pydantic = "pydantic"  # results as Property objects
    pandas = "pandas"  # results as a pandas DataFrame (the default)
    raw = "raw"  # result dicts without Property conversion
class SiteName(Enum): class SiteName(Enum):
ZILLOW = "zillow" ZILLOW = "zillow"
REDFIN = "redfin" REDFIN = "redfin"
@ -19,6 +25,7 @@ class SiteName(Enum):
class SearchPropertyType(Enum): class SearchPropertyType(Enum):
SINGLE_FAMILY = "single_family" SINGLE_FAMILY = "single_family"
APARTMENT = "apartment"
CONDOS = "condos" CONDOS = "condos"
CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop" CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop"
CONDO_TOWNHOME = "condo_townhome" CONDO_TOWNHOME = "condo_townhome"
@ -148,6 +155,9 @@ class Property:
property_url: str property_url: str
property_id: str property_id: str
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None listing_id: str | None = None
mls: str | None = None mls: str | None = None
@ -167,6 +177,8 @@ class Property:
hoa_fee: int | None = None hoa_fee: int | None = None
days_on_mls: int | None = None days_on_mls: int | None = None
description: Description | None = None description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None

View File

@ -32,8 +32,9 @@ from ..models import (
Builder, Builder,
Advertisers, Advertisers,
Office, Office,
ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@ -120,7 +121,7 @@ class RealtorScraper(Scraper):
property_info = response_json["data"]["home"] property_info = response_json["data"]["home"]
return [self.process_property(property_info, "home")] return [self.process_property(property_info)]
@staticmethod @staticmethod
def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None: def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@ -168,7 +169,7 @@ class RealtorScraper(Scraper):
return processed_advertisers return processed_advertisers
def process_property(self, result: dict, query_name: str) -> Property | None: def process_property(self, result: dict) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only: if not mls and self.mls_only:
@ -188,9 +189,7 @@ class RealtorScraper(Scraper):
return return
property_id = result["property_id"] property_id = result["property_id"]
prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {} prop_details = self.process_extra_property_details(result) if self.extra_property_data else {}
if not prop_details:
prop_details = self.process_extra_property_details(result)
property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues") property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
estimated_value = self.get_key(property_estimates_root, [0, "estimate"]) estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
@ -233,7 +232,7 @@ class RealtorScraper(Scraper):
) )
return realty_property return realty_property
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]: def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
@ -258,7 +257,7 @@ class RealtorScraper(Scraper):
sort_param = ( sort_param = (
"sort: [{ field: sold_date, direction: desc }]" "sort: [{ field: sold_date, direction: desc }]"
if self.listing_type == ListingType.SOLD if self.listing_type == ListingType.SOLD
else "sort: [{ field: list_date, direction: desc }]" else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor
) )
pending_or_contingent_param = ( pending_or_contingent_param = (
@ -306,24 +305,20 @@ class RealtorScraper(Scraper):
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query Home_search( query = """query Home_search(
$city: String, $location: String!,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int, $offset: Int,
) { ) {
home_search( home_search(
query: { query: {
%s %s
city: $city search_location: {location: $location}
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s status: %s
unique: true
%s %s
%s %s
%s %s
} }
bucket: { sort: "fractal_v1.1.3_fr" }
%s %s
limit: 200 limit: 200
offset: $offset offset: $offset
@ -363,7 +358,7 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search" search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Property] = [] properties: list[Union[Property, dict]] = []
if ( if (
response_json is None response_json is None
@ -381,15 +376,25 @@ class RealtorScraper(Scraper):
#: limit the number of properties to be processed #: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50 #: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[: self.limit - offset] properties_list: list[dict] = properties_list[: self.limit - offset]
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: if self.extra_property_data:
futures = [executor.submit(self.process_property, result, search_key) for result in properties_list] property_ids = [data["property_id"] for data in properties_list]
extra_property_details = self.get_bulk_prop_details(property_ids) or {}
for future in as_completed(futures): for result in properties_list:
result = future.result() result.update(extra_property_details.get(result["property_id"], {}))
if result:
properties.append(result) if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result) for result in properties_list]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
else:
properties = properties_list
return { return {
"total": total_properties, "total": total_properties,
@ -434,10 +439,7 @@ class RealtorScraper(Scraper):
else: #: general search, location else: #: general search, location
search_variables |= { search_variables |= {
"city": location_info.get("city"), "location": self.location,
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
} }
if self.foreclosure: if self.foreclosure:
@ -520,28 +522,35 @@ class RealtorScraper(Scraper):
wait=wait_exponential(min=4, max=10), wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
    """
    Fetch extra property details for multiple properties in a single GraphQL query.

    Each property is requested under a ``home_<property_id>`` alias so that one
    request can carry many lookups.

    :param property_ids: Property ids to look up; duplicates are collapsed.
    :return: Map of property_id to its details. Homes that came back empty are
        omitted. ``{}`` is returned when extra data is disabled, no ids are
        given, or the response carries no usable ``data`` object.
    """
    if not self.extra_property_data or not property_ids:
        return {}

    # De-duplicate while keeping first-seen order so the generated query is deterministic
    unique_ids = list(dict.fromkeys(property_ids))

    # Construct the bulk query: one aliased `home` selection per property
    alias_prefix = "home_"
    fragments = "\n".join(
        f"{alias_prefix}{property_id}: home(property_id: {property_id}) {{ ...HomeData }}"
        for property_id in unique_ids
    )
    query = f"""{HOME_FRAGMENT}

    query GetHomes {{
        {fragments}
    }}"""

    response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
    payload = response.json()

    # A GraphQL error response may omit "data" or set it to null
    homes = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(homes, dict):
        return {}

    # Strip only the leading alias prefix to recover the property id
    return {alias.removeprefix(alias_prefix): details for alias, details in homes.items() if details}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:

View File

@ -11,6 +11,34 @@ _SEARCH_HOMES_DATA_BASE = """{
list_price_max list_price_max
list_price_min list_price_min
price_per_sqft price_per_sqft
tags
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
list_price
__typename
}
flags { flags {
is_contingent is_contingent
is_pending is_pending
@ -64,11 +92,14 @@ _SEARCH_HOMES_DATA_BASE = """{
tax_record { tax_record {
public_record_id public_record_id
} }
primary_photo { primary_photo(https: true) {
href href
} }
photos { photos(https: true) {
href href
tags {
label
}
} }
advertisers { advertisers {
email email
@ -116,15 +147,63 @@ _SEARCH_HOMES_DATA_BASE = """{
} }
rental_management { rental_management {
name name
href
fulfillment_id fulfillment_id
} }
} }
""" """
# GraphQL fragment listing the extra per-home fields (nearby schools, tax
# history, monthly/one-time fees, parking, terms). It is spread as
# `...HomeData` into each aliased `home(...)` selection of the bulk details query.
HOME_FRAGMENT = """
fragment HomeData on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
}
"""
HOMES_DATA = """%s HOMES_DATA = """%s
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
} }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } } taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates { estimates {
__typename __typename
@ -141,19 +220,19 @@ HOMES_DATA = """%s
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
SEARCH_HOMES_DATA = """%s SEARCH_HOMES_DATA = """%s
current_estimates { current_estimates {
__typename __typename
source { source {
__typename __typename
type type
name name
} }
estimate estimate
estimateHigh: estimate_high estimateHigh: estimate_high
estimateLow: estimate_low estimateLow: estimate_low
date date
isBestHomeValue: isbest_homevalue isBestHomeValue: isbest_homevalue
} }
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{ GENERAL_RESULTS_QUERY = """{

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.4.5" version = "0.4.7"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -1,4 +1,5 @@
from homeharvest import scrape_property from homeharvest import scrape_property, Property
import pandas as pd
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
@ -287,3 +288,15 @@ def test_phone_number_matching():
#: assert phone numbers are the same #: assert phone numbers are the same
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0] assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_return_type():
    """Check that each ``return_type`` option yields the matching result type."""
    # Same search for every variant so only the return format differs.
    search = {"location": "Surprise, AZ", "listing_type": "for_rent", "limit": 100}

    results = {
        "pandas": scrape_property(**search),
        "pydantic": scrape_property(**search, return_type="pydantic"),
        "raw": scrape_property(**search, return_type="raw"),
    }

    assert isinstance(results["pandas"], pd.DataFrame)

    # Guard the [0] lookups so an empty result fails with a readable message
    # instead of an IndexError.
    assert results["pydantic"], "pydantic search returned no results"
    assert isinstance(results["pydantic"][0], Property)
    assert results["raw"], "raw search returned no results"
    assert isinstance(results["raw"][0], dict)