Compare commits

...

12 Commits

Author SHA1 Message Date
zachary e378feeefe - bug fixes 2025-04-12 18:34:35 -07:00
zachary 8a5683fe79 - return type parameter
- optimized get extra fields with query clustering
2025-04-12 17:55:52 -07:00
Zachary Hampton 65f799a27d
Update README.md 2025-02-21 13:33:32 -07:00
Cullen Watson 0de916e590 enh:tax history 2025-01-06 05:28:36 -06:00
Cullen Watson 6a3f7df087 chore:yml 2024-11-05 23:55:59 -06:00
Cullen Watson a75bcc2aa0
docs:readme 2024-11-04 10:22:32 -06:00
Cullen Watson 1082b86fa1
docs:readme 2024-11-03 17:23:58 -06:00
Cullen Watson 8e04f6b117
enh: property type (#102) 2024-11-03 17:23:07 -06:00
Zachary Hampton 1f717bd9e3 - switch eps
- new hrefs
- property_id, listing_id data points
2024-09-06 15:49:07 -07:00
Zachary Hampton 8cfe056f79 - office mls set 2024-08-23 10:54:43 -07:00
Zachary Hampton 1010c743b6 - agent mls set and nrds id 2024-08-23 10:47:45 -07:00
Zachary Hampton 32fdc281e3 - rewrote & optimized flow
- new_construction data point
- renamed "agent" & "broker" to "agent_name" & "broker_name"
- added builder & office data
- added entity uuids
2024-08-20 05:19:15 -07:00
14 changed files with 954 additions and 737 deletions

1
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1 @@
github: Bunsly

View File

@ -2,10 +2,6 @@
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings. **HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
## HomeHarvest Features ## HomeHarvest Features
- **Source**: Fetches properties directly from **Realtor.com**. - **Source**: Fetches properties directly from **Realtor.com**.
@ -40,6 +36,7 @@ properties = scrape_property(
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# property_type=['single_family','multi_family'],
# date_from="2023-05-01", # alternative to past_days # date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28", # date_to="2023-05-28",
# foreclosure=True # foreclosure=True
@ -68,13 +65,30 @@ print(properties.head())
``` ```
Required Required
├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc. ├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.
── listing_type (option): Choose the type of listing. ── listing_type (option): Choose the type of listing.
- 'for_rent' - 'for_rent'
- 'for_sale' - 'for_sale'
- 'sold' - 'sold'
- 'pending' - 'pending' (for pending/contingent sales)
Optional Optional
├── property_type (list): Choose the type of properties.
- 'single_family'
- 'multi_family'
- 'condos'
- 'condo_townhome_rowhome_coop'
- 'condo_townhome'
- 'townhomes'
- 'duplex_triplex'
- 'farm'
- 'land'
- 'mobile'
├── return_type (option): Choose the return type.
│ - 'pandas' (default)
│ - 'pydantic'
│ - 'raw' (json)
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses. ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored) │ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
@ -92,9 +106,9 @@ Optional
├── proxy (string): In format 'http://user:pass@host:port' ├── proxy (string): In format 'http://user:pass@host:port'
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data for general searches (e.g. schools, tax appraisals etc.)
├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' ├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending'
└── limit (integer): Limit the number of properties to fetch. Max & default is 10000. └── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
``` ```
@ -104,6 +118,8 @@ Optional
Property Property
├── Basic Information: ├── Basic Information:
│ ├── property_url │ ├── property_url
│ ├── property_id
│ ├── listing_id
│ ├── mls │ ├── mls
│ ├── mls_id │ ├── mls_id
│ └── status │ └── status
@ -123,6 +139,7 @@ Property
│ ├── sqft │ ├── sqft
│ ├── year_built │ ├── year_built
│ ├── stories │ ├── stories
│ ├── garage
│ └── lot_sqft │ └── lot_sqft
├── Property Listing Details: ├── Property Listing Details:
@ -135,29 +152,47 @@ Property
│ ├── sold_price │ ├── sold_price
│ ├── last_sold_date │ ├── last_sold_date
│ ├── price_per_sqft │ ├── price_per_sqft
│ ├── parking_garage │ ├── new_construction
│ └── hoa_fee │ └── hoa_fee
├── Tax Information:
│ ├── year
│ ├── tax
│ ├── assessment
│ │ ├── building
│ │ ├── land
│ │ └── total
├── Location Details: ├── Location Details:
│ ├── latitude │ ├── latitude
│ ├── longitude │ ├── longitude
│ ├── nearby_schools │ ├── nearby_schools
├── Agent Info: ├── Agent Info:
│ ├── agent │ ├── agent_id
│ ├── agent_name
│ ├── agent_email │ ├── agent_email
│ └── agent_phone │ └── agent_phone
├── Broker Info: ├── Broker Info:
│ ├── broker │ ├── broker_id
│ ├── broker_email │ └── broker_name
│ └── broker_website
├── Builder Info:
│ ├── builder_id
│ └── builder_name
├── Office Info:
│ ├── office_id
│ ├── office_name
│ ├── office_phones
│ └── office_email
``` ```
### Exceptions ### Exceptions
The following exceptions may be raised when using HomeHarvest: The following exceptions may be raised when using HomeHarvest:
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`, `pending`.
- `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD. - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD.
- `AuthenticationError` - Realtor.com token request failed. - `AuthenticationError` - Realtor.com token request failed.

View File

@ -1,141 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"from homeharvest import scrape_property\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None) # Show all columns\n",
"pd.set_option('display.max_rows', None) # Show all rows\n",
"pd.set_option('display.width', None) # Auto-adjust display width to fit console\n",
"pd.set_option('display.max_colwidth', 50) # Limit max column width to 50 characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
"metadata": {},
"outputs": [],
"source": [
"# check for sale properties\n",
"scrape_property(\n",
" location=\"dallas\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaf86093",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# search a specific address\n",
"scrape_property(\n",
" location=\"2530 Al Lipscomb Way\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
"metadata": {},
"outputs": [],
"source": [
"# check rentals\n",
"scrape_property(\n",
" location=\"chicago, illinois\",\n",
" listing_type=\"for_rent\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af280cd3",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# check sold properties\n",
"properties = scrape_property(\n",
" location=\"90210\",\n",
" listing_type=\"sold\",\n",
" past_days=10\n",
")\n",
"display(properties)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "628c1ce2",
"metadata": {
"collapsed": false,
"is_executing": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# display clickable URLs\n",
"from IPython.display import display, HTML\n",
"properties['property_url'] = '<a href=\"' + properties['property_url'] + '\" target=\"_blank\">' + properties['property_url'] + '</a>'\n",
"\n",
"html = properties.to_html(escape=False)\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,20 +0,0 @@
from homeharvest import scrape_property
from datetime import datetime
# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
# pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
# mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
)
print(f"Number of properties: {len(properties)}")
# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())

104
examples/price_of_land.py Normal file
View File

@ -0,0 +1,104 @@
"""
This script scrapes sold and pending sold land listings in past year for a list of zip codes and saves the data to individual Excel files.
It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre) for user to analyze average price of land in a zip code.
"""
import os
import pandas as pd
from homeharvest import scrape_property
def get_property_details(zip: str, listing_type):
properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
if not properties.empty:
properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None)
properties = properties[properties["sqft"].isnull()]
properties["ppa"] = properties.apply(
lambda row: (
int(
(
row["sold_price"]
if (pd.notnull(row["sold_price"]) and row["status"] == "SOLD")
else row["list_price"]
)
/ row["lot_acres"]
)
if pd.notnull(row["lot_acres"])
and row["lot_acres"] > 0
and (pd.notnull(row["sold_price"]) or pd.notnull(row["list_price"]))
else None
),
axis=1,
)
properties["ppa"] = properties["ppa"].astype("Int64")
selected_columns = [
"property_url",
"property_id",
"style",
"status",
"street",
"city",
"state",
"zip_code",
"county",
"list_date",
"last_sold_date",
"list_price",
"sold_price",
"lot_sqft",
"lot_acres",
"ppa",
]
properties = properties[selected_columns]
return properties
def output_to_excel(zip_code, sold_df, pending_df):
root_folder = os.getcwd()
zip_folder = os.path.join(root_folder, "zips", zip_code)
# Create zip code folder if it doesn't exist
os.makedirs(zip_folder, exist_ok=True)
# Define file paths
sold_file = os.path.join(zip_folder, f"{zip_code}_sold.xlsx")
pending_file = os.path.join(zip_folder, f"{zip_code}_pending.xlsx")
# Save individual sold and pending files
sold_df.to_excel(sold_file, index=False)
pending_df.to_excel(pending_file, index=False)
zip_codes = map(
str,
[
22920,
77024,
78028,
24553,
22967,
22971,
22922,
22958,
22969,
22949,
22938,
24599,
24562,
22976,
24464,
22964,
24581,
],
)
combined_df = pd.DataFrame()
for zip in zip_codes:
sold_df = get_property_details(zip, "sold")
pending_df = get_property_details(zip, "pending")
combined_df = pd.concat([combined_df, sold_df, pending_df], ignore_index=True)
output_to_excel(zip, sold_df, pending_df)
combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
combined_df.to_excel(combined_file, index=False)

View File

@ -3,12 +3,14 @@ import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
def scrape_property( def scrape_property(
location: str, location: str,
listing_type: str = "for_sale", listing_type: str = "for_sale",
return_type: str = "pandas",
property_type: list[str] | None = None,
radius: float = None, radius: float = None,
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int = None,
@ -18,12 +20,14 @@ def scrape_property(
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000, limit: int = 10000
) -> pd.DataFrame: ) -> pd.DataFrame | list[dict] | list[Property]:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending) :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
:param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping :param proxy: Proxy to use for scraping
@ -40,7 +44,9 @@ def scrape_property(
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType[listing_type.upper()], listing_type=ListingType(listing_type.upper()),
return_type=ReturnType(return_type.lower()),
property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
proxy=proxy, proxy=proxy,
radius=radius, radius=radius,
mls_only=mls_only, mls_only=mls_only,
@ -56,6 +62,9 @@ def scrape_property(
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
if scraper_input.return_type != ReturnType.pandas:
return results
properties_dfs = [df for result in results if not (df := process_result(result)).empty] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
@ -63,4 +72,6 @@ def scrape_property(
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA}) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace(
{"None": pd.NA, None: pd.NA, "": pd.NA}
)

View File

@ -1,11 +1,13 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import Union
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json import json
@ -13,6 +15,7 @@ import json
class ScraperInput: class ScraperInput:
location: str location: str
listing_type: ListingType listing_type: ListingType
property_type: list[SearchPropertyType] | None = None
radius: float | None = None radius: float | None = None
mls_only: bool | None = False mls_only: bool | None = False
proxy: str | None = None proxy: str | None = None
@ -23,6 +26,7 @@ class ScraperInput:
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000 limit: int = 10000
return_type: ReturnType = ReturnType.pandas
class Scraper: class Scraper:
@ -34,11 +38,12 @@ class Scraper:
): ):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.property_type = scraper_input.property_type
if not self.session: if not self.session:
Scraper.session = requests.Session() Scraper.session = requests.Session()
retries = Retry( retries = Retry(
total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"]) total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
) )
adapter = HTTPAdapter(max_retries=retries) adapter = HTTPAdapter(max_retries=retries)
@ -46,8 +51,21 @@ class Scraper:
Scraper.session.mount("https://", adapter) Scraper.session.mount("https://", adapter)
Scraper.session.headers.update( Scraper.session.headers.update(
{ {
"auth": f"Bearer {self.get_access_token()}", "accept": "application/json, text/javascript",
"apollographql-client-name": "com.move.Realtor-apollo-ios", "accept-language": "en-US,en;q=0.9",
"cache-control": "no-cache",
"content-type": "application/json",
"origin": "https://www.realtor.com",
"pragma": "no-cache",
"priority": "u=1, i",
"rdc-ab-tests": "commute_travel_time_variation:v1",
"sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
} }
) )
@ -66,8 +84,9 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
def search(self) -> list[Property]: ... def search(self) -> list[Union[Property | dict]]: ...
@staticmethod @staticmethod
def _parse_home(home) -> Property: ... def _parse_home(home) -> Property: ...
@ -81,27 +100,29 @@ class Scraper:
response = requests.post( response = requests.post(
"https://graph.realtor.com/auth/token", "https://graph.realtor.com/auth/token",
headers={ headers={
'Host': 'graph.realtor.com', "Host": "graph.realtor.com",
'Accept': '*/*', "Accept": "*/*",
'Content-Type': 'Application/json', "Content-Type": "Application/json",
'X-Client-ID': 'rdc_mobile_native,iphone', "X-Client-ID": "rdc_mobile_native,iphone",
'X-Visitor-ID': device_id, "X-Visitor-ID": device_id,
'X-Client-Version': '24.21.23.679885', "X-Client-Version": "24.21.23.679885",
'Accept-Language': 'en-US,en;q=0.9', "Accept-Language": "en-US,en;q=0.9",
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0', "User-Agent": "Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0",
}, },
data=json.dumps({ data=json.dumps(
"grant_type": "device_mobile", {
"device_id": device_id, "grant_type": "device_mobile",
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone" "device_id": device_id,
})) "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone",
}
),
)
data = response.json() data = response.json()
if not (access_token := data.get("access_token")): if not (access_token := data.get("access_token")):
raise AuthenticationError( raise AuthenticationError(
"Failed to get access token, use a proxy/vpn or wait a moment and try again.", "Failed to get access token, use a proxy/vpn or wait a moment and try again.", response=response
response=response
) )
return access_token return access_token

View File

@ -4,6 +4,12 @@ from enum import Enum
from typing import Optional from typing import Optional
class ReturnType(Enum):
pydantic = "pydantic"
pandas = "pandas"
raw = "raw"
class SiteName(Enum): class SiteName(Enum):
ZILLOW = "zillow" ZILLOW = "zillow"
REDFIN = "redfin" REDFIN = "redfin"
@ -17,6 +23,20 @@ class SiteName(Enum):
raise ValueError(f"{value} not found in {cls}") raise ValueError(f"{value} not found in {cls}")
class SearchPropertyType(Enum):
SINGLE_FAMILY = "single_family"
APARTMENT = "apartment"
CONDOS = "condos"
CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop"
CONDO_TOWNHOME = "condo_townhome"
TOWNHOMES = "townhomes"
DUPLEX_TRIPLEX = "duplex_triplex"
FARM = "farm"
LAND = "land"
MULTI_FAMILY = "multi_family"
MOBILE = "mobile"
class ListingType(Enum): class ListingType(Enum):
FOR_SALE = "FOR_SALE" FOR_SALE = "FOR_SALE"
FOR_RENT = "FOR_RENT" FOR_RENT = "FOR_RENT"
@ -90,23 +110,56 @@ class AgentPhone: #: For documentation purposes only (at the moment)
@dataclass @dataclass
class Agent: class Entity:
name: str | None = None name: str
uuid: str | None = None
@dataclass
class Agent(Entity):
mls_set: str | None = None
nrds_id: str | None = None
phones: list[dict] | AgentPhone | None = None phones: list[dict] | AgentPhone | None = None
email: str | None = None email: str | None = None
href: str | None = None href: str | None = None
@dataclass @dataclass
class Broker: class Office(Entity):
name: str | None = None mls_set: str | None = None
phone: str | None = None email: str | None = None
website: str | None = None href: str | None = None
phones: list[dict] | AgentPhone | None = None
@dataclass
class Broker(Entity):
pass
@dataclass
class Builder(Entity):
pass
@dataclass
class Advertisers:
agent: Agent | None = None
broker: Broker | None = None
builder: Builder | None = None
office: Office | None = None
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str
property_id: str
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None
mls: str | None = None mls: str | None = None
mls_id: str | None = None mls_id: str | None = None
status: str | None = None status: str | None = None
@ -120,17 +173,22 @@ class Property:
pending_date: str | None = None pending_date: str | None = None
last_sold_date: str | None = None last_sold_date: str | None = None
prc_sqft: int | None = None prc_sqft: int | None = None
new_construction: bool | None = None
hoa_fee: int | None = None hoa_fee: int | None = None
days_on_mls: int | None = None days_on_mls: int | None = None
description: Description | None = None description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
neighborhoods: Optional[str] = None neighborhoods: Optional[str] = None
county: Optional[str] = None county: Optional[str] = None
fips_code: Optional[str] = None fips_code: Optional[str] = None
agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None assessed_value: int | None = None
estimated_value: int | None = None estimated_value: int | None = None
tax: int | None = None
tax_history: list[dict] | None = None
advertisers: Advertisers | None = None

View File

@ -6,12 +6,35 @@ This module implements the scraper for realtor.com
""" """
from __future__ import annotations from __future__ import annotations
import json
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from json import JSONDecodeError
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from tenacity import (
retry,
retry_if_exception_type,
wait_exponential,
stop_after_attempt,
)
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker from ..models import (
Property,
Address,
ListingType,
Description,
PropertyType,
Agent,
Broker,
Builder,
Advertisers,
Office,
ReturnType
)
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@ -46,155 +69,6 @@ class RealtorScraper(Scraper):
return result[0] return result[0]
def handle_listing(self, listing_id: str) -> list[Property]:
query = """query Listing($listing_id: ID!) {
listing(id: $listing_id) {
source {
id
listing_id
}
address {
line
street_direction
street_number
street_name
street_suffix
unit
city
state_code
postal_code
location {
coordinate {
lat
lon
}
}
}
basic {
sqft
beds
baths_full
baths_half
lot_sqft
sold_price
type
price
status
sold_date
list_date
}
details {
year_built
stories
garage
permalink
}
media {
photos {
href
}
}
}
}"""
variables = {"listing_id": listing_id}
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
property_info = response_json["data"]["listing"]
mls = (
property_info["source"].get("id")
if "source" in property_info and isinstance(property_info["source"], dict)
else None
)
able_to_get_lat_long = (
property_info
and property_info.get("address")
and property_info["address"].get("location")
and property_info["address"]["location"].get("coordinate")
)
list_date_str = (
property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None
)
last_sold_date_str = (
property_info["basic"]["sold_date"].split("T")[0] if property_info["basic"].get("sold_date") else None
)
pending_date_str = property_info["pending_date"].split("T")[0] if property_info.get("pending_date") else None
list_date = datetime.strptime(list_date_str, "%Y-%m-%d") if list_date_str else None
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None
pending_date = datetime.strptime(pending_date_str, "%Y-%m-%d") if pending_date_str else None
today = datetime.now()
days_on_mls = None
status = property_info["basic"]["status"].lower()
if list_date:
if status == "sold" and last_sold_date:
days_on_mls = (last_sold_date - list_date).days
elif status in ("for_sale", "for_rent"):
days_on_mls = (today - list_date).days
if days_on_mls and days_on_mls < 0:
days_on_mls = None
property_id = property_info["details"]["permalink"]
prop_details = self.get_prop_details(property_id)
style = property_info["basic"].get("type", "").upper()
listing = Property(
mls=mls,
mls_id=(
property_info["source"].get("listing_id")
if "source" in property_info and isinstance(property_info["source"], dict)
else None
),
property_url=f"{self.PROPERTY_URL}{property_id}",
status=property_info["basic"]["status"].upper(),
list_price=property_info["basic"]["price"],
list_date=list_date,
prc_sqft=(
property_info["basic"].get("price") / property_info["basic"].get("sqft")
if property_info["basic"].get("price") and property_info["basic"].get("sqft")
else None
),
last_sold_date=last_sold_date,
pending_date=pending_date,
latitude=property_info["address"]["location"]["coordinate"].get("lat") if able_to_get_lat_long else None,
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"),
description=Description(
alt_photos=(
self.process_alt_photos(property_info["media"].get("photos", []))
if property_info.get("media")
else None
),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"),
baths_half=property_info["basic"].get("baths_half"),
sqft=property_info["basic"].get("sqft"),
lot_sqft=property_info["basic"].get("lot_sqft"),
sold_price=property_info["basic"].get("sold_price"),
year_built=property_info["details"].get("year_built"),
garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"),
text=property_info.get("description", {}).get("text"),
),
days_on_mls=days_on_mls,
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)
return [listing]
def get_latest_listing_id(self, property_id: str) -> str | None: def get_latest_listing_id(self, property_id: str) -> str | None:
query = """query Property($property_id: ID!) { query = """query Property($property_id: ID!) {
property(id: $property_id) { property(id: $property_id) {
@ -228,65 +102,15 @@ class RealtorScraper(Scraper):
else: else:
return property_info["listings"][0]["listing_id"] return property_info["listings"][0]["listing_id"]
def handle_address(self, property_id: str) -> list[Property]: def handle_home(self, property_id: str) -> list[Property]:
""" query = (
Handles a specific address & returns one property """query Home($property_id: ID!) {
""" home(property_id: $property_id) %s
query = """query Property($property_id: ID!) {
property(id: $property_id) {
property_id
details {
date_updated
garage
permalink
year_built
stories
}
address {
line
street_direction
street_number
street_name
street_suffix
unit
city
state_code
postal_code
location {
coordinate {
lat
lon
}
}
}
basic {
baths
beds
price
sqft
lot_sqft
type
sold_price
}
public_record {
lot_size
sqft
stories
units
year_built
}
primary_photo {
href
}
photos {
href
}
}
}""" }"""
% HOMES_DATA
)
variables = {"property_id": property_id} variables = {"property_id": property_id}
prop_details = self.get_prop_details(property_id)
payload = { payload = {
"query": query, "query": query,
"variables": variables, "variables": variables,
@ -295,103 +119,123 @@ class RealtorScraper(Scraper):
response = self.session.post(self.SEARCH_GQL_URL, json=payload) response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json() response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["home"]
return [ return [self.process_property(property_info)]
Property(
mls_id=property_id,
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)
]
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]: @staticmethod
def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
if not advertisers:
return None
def _parse_fulfillment_id(fulfillment_id: str | None) -> str | None:
return fulfillment_id if fulfillment_id and fulfillment_id != "0" else None
processed_advertisers = Advertisers()
for advertiser in advertisers:
advertiser_type = advertiser.get("type")
if advertiser_type == "seller": #: agent
processed_advertisers.agent = Agent(
uuid=_parse_fulfillment_id(advertiser.get("fulfillment_id")),
nrds_id=advertiser.get("nrds_id"),
mls_set=advertiser.get("mls_set"),
name=advertiser.get("name"),
email=advertiser.get("email"),
phones=advertiser.get("phones"),
)
if advertiser.get("broker") and advertiser["broker"].get("name"): #: has a broker
processed_advertisers.broker = Broker(
uuid=_parse_fulfillment_id(advertiser["broker"].get("fulfillment_id")),
name=advertiser["broker"].get("name"),
)
if advertiser.get("office"): #: has an office
processed_advertisers.office = Office(
uuid=_parse_fulfillment_id(advertiser["office"].get("fulfillment_id")),
mls_set=advertiser["office"].get("mls_set"),
name=advertiser["office"].get("name"),
email=advertiser["office"].get("email"),
phones=advertiser["office"].get("phones"),
)
if advertiser_type == "community": #: could be builder
if advertiser.get("builder"):
processed_advertisers.builder = Builder(
uuid=_parse_fulfillment_id(advertiser["builder"].get("fulfillment_id")),
name=advertiser["builder"].get("name"),
)
return processed_advertisers
def process_property(self, result: dict) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only:
return
able_to_get_lat_long = (
result
and result.get("location")
and result["location"].get("address")
and result["location"]["address"].get("coordinate")
)
is_pending = result["flags"].get("is_pending")
is_contingent = result["flags"].get("is_contingent")
if (is_pending or is_contingent) and (self.exclude_pending and self.listing_type != ListingType.PENDING):
return
property_id = result["property_id"]
prop_details = self.process_extra_property_details(result) if self.extra_property_data else {}
property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
advertisers = self.process_advertisers(result.get("advertisers"))
realty_property = Property(
mls=mls,
mls_id=(
result["source"].get("listing_id")
if "source" in result and isinstance(result["source"], dict)
else None
),
property_url=result["href"],
property_id=property_id,
listing_id=result.get("listing_id"),
status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None),
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None),
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None),
longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None),
address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result),
county=(result["location"]["county"].get("name") if result["location"]["county"] else None),
fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None),
days_on_mls=self.calculate_days_on_mls(result),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=estimated_value if estimated_value else None,
advertisers=advertisers,
tax=prop_details.get("tax"),
tax_history=prop_details.get("tax_history"),
)
return realty_property
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
results_query = """{
count
total
results {
pending_date
property_id
list_date
status
last_sold_price
last_sold_date
list_price
list_price_max
list_price_min
price_per_sqft
flags {
is_contingent
is_pending
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
sold_price
year_built
garage
sold_price
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
}
tax_record {
public_record_id
}
primary_photo {
href
}
photos {
href
}
}
}
}"""
date_param = "" date_param = ""
if self.listing_type == ListingType.SOLD: if self.listing_type == ListingType.SOLD:
@ -405,10 +249,15 @@ class RealtorScraper(Scraper):
elif self.last_x_days: elif self.last_x_days:
date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}' date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'
property_type_param = ""
if self.property_type:
property_types = [pt.value for pt in self.property_type]
property_type_param = f"type: {json.dumps(property_types)}"
sort_param = ( sort_param = (
"sort: [{ field: sold_date, direction: desc }]" "sort: [{ field: sold_date, direction: desc }]"
if self.listing_type == ListingType.SOLD if self.listing_type == ListingType.SOLD
else "sort: [{ field: list_date, direction: desc }]" else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor
) )
pending_or_contingent_param = ( pending_or_contingent_param = (
@ -439,62 +288,65 @@ class RealtorScraper(Scraper):
status: %s status: %s
%s %s
%s %s
%s
} }
%s %s
limit: 200 limit: 200
offset: $offset offset: $offset
) %s""" % ( ) %s
}""" % (
is_foreclosure, is_foreclosure,
listing_type.value.lower(), listing_type.value.lower(),
date_param, date_param,
property_type_param,
pending_or_contingent_param, pending_or_contingent_param,
sort_param, sort_param,
results_query, GENERAL_RESULTS_QUERY,
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query Home_search( query = """query Home_search(
$city: String, $location: String!,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int, $offset: Int,
) { ) {
home_search( home_search(
query: { query: {
%s %s
city: $city search_location: {location: $location}
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s status: %s
unique: true
%s
%s %s
%s %s
} }
bucket: { sort: "fractal_v1.1.3_fr" }
%s %s
limit: 200 limit: 200
offset: $offset offset: $offset
) %s""" % ( ) %s
}""" % (
is_foreclosure, is_foreclosure,
listing_type.value.lower(), listing_type.value.lower(),
date_param, date_param,
property_type_param,
pending_or_contingent_param, pending_or_contingent_param,
sort_param, sort_param,
results_query, GENERAL_RESULTS_QUERY,
) )
else: #: general search, came from an address else: #: general search, came from an address
query = ( query = (
"""query Property_search( """query Property_search(
$property_id: [ID]! $property_id: [ID]!
$offset: Int!, $offset: Int!,
) { ) {
property_search( home_search(
query: { query: {
property_id: $property_id property_id: $property_id
} }
limit: 1 limit: 1
offset: $offset offset: $offset
) %s""" ) %s
% results_query }"""
% GENERAL_RESULTS_QUERY
) )
payload = { payload = {
@ -506,92 +358,43 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search" search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Property] = [] properties: list[Union[Property, dict]] = []
if ( if (
response_json is None response_json is None
or "data" not in response_json or "data" not in response_json
or response_json["data"] is None or response_json["data"] is None
or search_key not in response_json["data"] or search_key not in response_json["data"]
or response_json["data"][search_key] is None or response_json["data"][search_key] is None
or "results" not in response_json["data"][search_key] or "results" not in response_json["data"][search_key]
): ):
return {"total": 0, "properties": []} return {"total": 0, "properties": []}
def process_property(result: dict) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only:
return
able_to_get_lat_long = (
result
and result.get("location")
and result["location"].get("address")
and result["location"]["address"].get("coordinate")
)
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
return
property_id = result["property_id"]
prop_details = self.get_prop_details(property_id) if self.extra_property_data else {}
realty_property = Property(
mls=mls,
mls_id=(
result["source"].get("listing_id")
if "source" in result and isinstance(result["source"], dict)
else None
),
property_url=(
f"{self.PROPERTY_URL}{property_id}"
if self.listing_type != ListingType.FOR_RENT
else f"{self.PROPERTY_URL}M{property_id}?listing_status=rental"
),
status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)
return realty_property
properties_list = response_json["data"][search_key]["results"] properties_list = response_json["data"][search_key]["results"]
total_properties = response_json["data"][search_key]["total"] total_properties = response_json["data"][search_key]["total"]
offset = variables.get("offset", 0) offset = variables.get("offset", 0)
#: limit the number of properties to be processed #: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50 #: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[:self.limit - offset] properties_list: list[dict] = properties_list[: self.limit - offset]
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: if self.extra_property_data:
futures = [ property_ids = [data["property_id"] for data in properties_list]
executor.submit(process_property, result) for result in properties_list extra_property_details = self.get_bulk_prop_details(property_ids) or {}
]
for future in as_completed(futures): for result in properties_list:
result = future.result() result.update(extra_property_details.get(result["property_id"], {}))
if result:
properties.append(result) if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result) for result in properties_list]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
else:
properties = properties_list
return { return {
"total": total_properties, "total": total_properties,
@ -617,17 +420,7 @@ class RealtorScraper(Scraper):
if location_type == "address": if location_type == "address":
if not self.radius: #: single address search, non comps if not self.radius: #: single address search, non comps
property_id = location_info["mpr_id"] property_id = location_info["mpr_id"]
search_variables |= {"property_id": property_id} return self.handle_home(property_id)
gql_results = self.general_search(search_variables, search_type=search_type)
if gql_results["total"] == 0:
listing_id = self.get_latest_listing_id(property_id)
if listing_id is None:
return self.handle_address(property_id)
else:
return self.handle_listing(listing_id)
else:
return gql_results["properties"]
else: #: general search, comps (radius) else: #: general search, comps (radius)
if not location_info.get("centroid"): if not location_info.get("centroid"):
@ -646,10 +439,7 @@ class RealtorScraper(Scraper):
else: #: general search, location else: #: general search, location
search_variables |= { search_variables |= {
"city": location_info.get("city"), "location": self.location,
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
} }
if self.foreclosure: if self.foreclosure:
@ -666,7 +456,11 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) )
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE) for i in range(
self.DEFAULT_PAGE_SIZE,
min(total, self.limit),
self.DEFAULT_PAGE_SIZE,
)
] ]
for future in as_completed(futures): for future in as_completed(futures):
@ -674,88 +468,90 @@ class RealtorScraper(Scraper):
return homes return homes
def get_prop_details(self, property_id: str) -> dict: @staticmethod
if not self.extra_property_data: def get_key(data: dict, keys: list):
try:
value = data
for key in keys:
value = value[key]
return value or {}
except (KeyError, TypeError, IndexError):
return {} return {}
#: TODO: migrate "advertisers" and "estimates" to general query def process_extra_property_details(self, result: dict) -> dict:
schools = self.get_key(result, ["nearbySchools", "schools"])
query = """query GetHome($property_id: ID!) { assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"])
home(property_id: $property_id) { tax_history = self.get_key(result, ["taxHistory"])
__typename
advertisers {
__typename
type
name
email
phones { number type ext primary }
}
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
}
}"""
variables = {"property_id": property_id}
response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables})
data = response.json()
def get_key(keys: list):
try:
value = data
for key in keys:
value = value[key]
return value or {}
except (KeyError, TypeError, IndexError):
return {}
agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
agents = [Agent(name=ad["name"], email=ad["email"], phones=ad["phones"]) for ad in agents]
brokers = [
Broker(name=ad["name"], phone=ad["phone"], website=ad["href"])
for ad in advertisers
if ad.get("type") != "Agent"
]
schools = [school["district"]["name"] for school in schools if school["district"].get("name")] schools = [school["district"]["name"] for school in schools if school["district"].get("name")]
# Process tax history
latest_tax = None
processed_tax_history = None
if tax_history and isinstance(tax_history, list):
tax_history = sorted(tax_history, key=lambda x: x.get("year", 0), reverse=True)
if tax_history and "tax" in tax_history[0]:
latest_tax = tax_history[0]["tax"]
processed_tax_history = []
for entry in tax_history:
if "year" in entry and "tax" in entry:
processed_entry = {
"year": entry["year"],
"tax": entry["tax"],
}
if "assessment" in entry and isinstance(entry["assessment"], dict):
processed_entry["assessment"] = {
"building": entry["assessment"].get("building"),
"land": entry["assessment"].get("land"),
"total": entry["assessment"].get("total"),
}
processed_tax_history.append(processed_entry)
return { return {
"agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None, "schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None, "assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None, "tax": latest_tax,
"tax_history": processed_tax_history,
} }
@retry(
retry=retry_if_exception_type(JSONDecodeError),
wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3),
)
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
"""
Fetch extra property details for multiple properties in a single GraphQL query.
Returns a map of property_id to its details.
"""
if not self.extra_property_data or not property_ids:
return {}
property_ids = list(set(property_ids))
# Construct the bulk query
fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}'
for property_id in property_ids
)
query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json()
if "data" not in data:
return {}
properties = data["data"]
return {data.replace('home_', ''): properties[data] for data in properties if properties[data]}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:
neighborhoods_list = [] neighborhoods_list = []
@ -816,20 +612,22 @@ class RealtorScraper(Scraper):
style = style.upper() style = style.upper()
primary_photo = "" primary_photo = ""
if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")): if (primary_photo_info := result.get("primary_photo")) and (
primary_photo_href := primary_photo_info.get("href")
):
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None, style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None),
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),
sqft=description_data.get("sqft"), sqft=description_data.get("sqft"),
lot_sqft=description_data.get("lot_sqft"), lot_sqft=description_data.get("lot_sqft"),
sold_price=( sold_price=(
description_data.get("sold_price") result.get("last_sold_price") or description_data.get("sold_price")
if result.get("last_sold_date") or result["list_price"] != description_data.get("sold_price") if result.get("last_sold_date") or result["list_price"] != description_data.get("sold_price")
else None else None
), #: has a sold date or list and sold price are different ), #: has a sold date or list and sold price are different
@ -859,14 +657,12 @@ class RealtorScraper(Scraper):
return days return days
@staticmethod @staticmethod
def process_alt_photos(photos_info): def process_alt_photos(photos_info: list[dict]) -> list[str] | None:
try: if not photos_info:
alt_photos = [] return None
if photos_info:
for photo_info in photos_info: return [
href = photo_info.get("href", "") photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
alt_photo_href = href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") for photo_info in photos_info
alt_photos.append(alt_photo_href) if photo_info.get("href")
return alt_photos ]
except Exception:
pass

View File

@ -0,0 +1,242 @@
_SEARCH_HOMES_DATA_BASE = """{
pending_date
listing_id
property_id
href
list_date
status
last_sold_price
last_sold_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
}
tax_record {
public_record_id
}
primary_photo(https: true) {
href
}
photos(https: true) {
href
tags {
label
}
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
"""
HOME_FRAGMENT = """
fragment HomeData on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
}
"""
HOMES_DATA = """%s
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
}""" % _SEARCH_HOMES_DATA_BASE
SEARCH_HOMES_DATA = """%s
current_estimates {
__typename
source {
__typename
type
name
}
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{
count
total
results %s
}""" % SEARCH_HOMES_DATA

View File

@ -1,11 +1,13 @@
from __future__ import annotations from __future__ import annotations
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent from .core.scrapers.models import Property, ListingType, Advertisers
from .exceptions import InvalidListingType, InvalidDate from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
"property_url", "property_url",
"property_id",
"listing_id",
"mls", "mls",
"mls_id", "mls_id",
"status", "status",
@ -31,6 +33,9 @@ ordered_properties = [
"last_sold_date", "last_sold_date",
"assessed_value", "assessed_value",
"estimated_value", "estimated_value",
"tax",
"tax_history",
"new_construction",
"lot_sqft", "lot_sqft",
"price_per_sqft", "price_per_sqft",
"latitude", "latitude",
@ -41,12 +46,21 @@ ordered_properties = [
"stories", "stories",
"hoa_fee", "hoa_fee",
"parking_garage", "parking_garage",
"agent", "agent_id",
"agent_name",
"agent_email", "agent_email",
"agent_phones", "agent_phones",
"broker", "agent_mls_set",
"broker_phone", "agent_nrds_id",
"broker_website", "broker_id",
"broker_name",
"builder_id",
"builder_name",
"office_id",
"office_mls_set",
"office_name",
"office_email",
"office_phones",
"nearby_schools", "nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
@ -66,19 +80,34 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["state"] = address_data.state prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip prop_data["zip_code"] = address_data.zip
if "agents" in prop_data: if "advertisers" in prop_data and prop_data.get("advertisers"):
agents: list[Agent] | None = prop_data["agents"] advertiser_data: Advertisers | None = prop_data["advertisers"]
if agents: if advertiser_data.agent:
prop_data["agent"] = agents[0].name agent_data = advertiser_data.agent
prop_data["agent_email"] = agents[0].email prop_data["agent_id"] = agent_data.uuid
prop_data["agent_phones"] = agents[0].phones prop_data["agent_name"] = agent_data.name
prop_data["agent_email"] = agent_data.email
prop_data["agent_phones"] = agent_data.phones
prop_data["agent_mls_set"] = agent_data.mls_set
prop_data["agent_nrds_id"] = agent_data.nrds_id
if "brokers" in prop_data: if advertiser_data.broker:
brokers = prop_data["brokers"] broker_data = advertiser_data.broker
if brokers: prop_data["broker_id"] = broker_data.uuid
prop_data["broker"] = brokers[0].name prop_data["broker_name"] = broker_data.name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website if advertiser_data.builder:
builder_data = advertiser_data.builder
prop_data["builder_id"] = builder_data.uuid
prop_data["builder_name"] = builder_data.name
if advertiser_data.office:
office_data = advertiser_data.office
prop_data["office_id"] = office_data.uuid
prop_data["office_name"] = office_data.name
prop_data["office_email"] = office_data.email
prop_data["office_phones"] = office_data.phones
prop_data["office_mls_set"] = office_data.mls_set
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
@ -88,8 +117,11 @@ def process_result(result: Property) -> pd.DataFrame:
if description: if description:
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if isinstance(description.style, prop_data["style"] = (
str) else description.style.value if description.style else None description.style
if isinstance(description.style, str)
else description.style.value if description.style else None
)
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half

19
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
@ -667,6 +667,21 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
] ]
[[package]]
name = "tenacity"
version = "9.0.0"
description = "Retry code until it succeeds"
optional = false
python-versions = ">=3.8"
files = [
{file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
{file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
]
[package.extras]
doc = ["reno", "sphinx"]
test = ["pytest", "tornado (>=4.5)", "typeguard"]
[[package]] [[package]]
name = "tomli" name = "tomli"
version = "2.0.1" version = "2.0.1"
@ -740,4 +755,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.9,<3.13" python-versions = ">=3.9,<3.13"
content-hash = "21ef9cfb35c446a375a2b74c37691d7031afb1e4f66a8b63cb7c1669470689d2" content-hash = "cefc11b1bf5ad99d628f6d08f6f03003522cc1b6e48b519230d99d716a5c165c"

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.34" version = "0.4.7"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@ -14,6 +14,7 @@ python = ">=3.9,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"
pydantic = "^2.7.4" pydantic = "^2.7.4"
tenacity = "^9.0.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@ -1,4 +1,5 @@
from homeharvest import scrape_property from homeharvest import scrape_property, Property
import pandas as pd
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
@ -105,8 +106,12 @@ def test_realtor():
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",
listing_type="for_sale", listing_type="for_sale",
), ),
scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format scrape_property(
scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format location="Phoenix, AZ", listing_type="for_rent", limit=1000
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", listing_type="sold", limit=1000
), #: does not support "city, state, USA" format
scrape_property(location="85281"), scrape_property(location="85281"),
] ]
@ -114,11 +119,13 @@ def test_realtor():
def test_realtor_city(): def test_realtor_city():
results = scrape_property( results = scrape_property(location="Atlanta, GA", listing_type="for_sale", limit=1000)
location="Atlanta, GA",
listing_type="for_sale", assert results is not None and len(results) > 0
limit=1000
)
def test_realtor_land():
results = scrape_property(location="Atlanta, GA", listing_type="for_sale", property_type=["land"], limit=1000)
assert results is not None and len(results) > 0 assert results is not None and len(results) > 0
@ -128,6 +135,7 @@ def test_realtor_bad_address():
location="abceefg ju098ot498hh9", location="abceefg ju098ot498hh9",
listing_type="for_sale", listing_type="for_sale",
) )
if len(bad_results) == 0: if len(bad_results) == 0:
assert True assert True
@ -141,18 +149,23 @@ def test_realtor_foreclosed():
def test_realtor_agent(): def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000) scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000, extra_property_data=False)
assert scraped["agent"].nunique() > 1 assert scraped["agent_name"].nunique() > 1
def test_realtor_without_extra_details(): def test_realtor_without_extra_details():
results = [ results = [
scrape_property( scrape_property(
location="15509 N 172nd Dr, Surprise, AZ 85388", location="00741",
listing_type="sold",
limit=10,
extra_property_data=False, extra_property_data=False,
), ),
scrape_property( scrape_property(
location="15509 N 172nd Dr, Surprise, AZ 85388", location="00741",
listing_type="sold",
limit=10,
extra_property_data=True,
), ),
] ]
@ -235,6 +248,55 @@ def test_apartment_list_price():
results = results[results["style"] == "APARTMENT"] results = results[results["style"] == "APARTMENT"]
#: get percentage of results with atleast 1 of any column not none, list_price, list_price_min, list_price_max #: get percentage of results with atleast 1 of any column not none, list_price, list_price_min, list_price_max
assert len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len( assert (
results len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(results)
) > 0.5 > 0.5
)
def test_builder_exists():
listing = scrape_property(
location="18149 W Poston Dr, Surprise, AZ 85387",
extra_property_data=False,
)
assert listing is not None
assert listing["builder_name"].nunique() > 0
def test_phone_number_matching():
searches = [
scrape_property(
location="Phoenix, AZ",
listing_type="for_sale",
limit=100,
),
scrape_property(
location="Phoenix, AZ",
listing_type="for_sale",
limit=100,
),
]
assert all([search is not None for search in searches])
#: random row
row = searches[0][searches[0]["agent_phones"].notnull()].sample()
#: find matching row
matching_row = searches[1].loc[searches[1]["property_url"] == row["property_url"].values[0]]
#: assert phone numbers are the same
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_return_type():
results = {
"pandas": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100),
"pydantic": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic"),
"raw": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw"),
}
assert isinstance(results["pandas"], pd.DataFrame)
assert isinstance(results["pydantic"][0], Property)
assert isinstance(results["raw"][0], dict)