Compare commits

...

9 Commits

Author SHA1 Message Date
zacharyhampton
fefacdd264 Version bump to 0.8.8
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 17:32:06 -08:00
Zachary Hampton
3579c10196 Merge pull request #147 from ZacharyHampton/feature/ios-mobile-headers
Improve API stability and reliability
2025-12-05 19:30:25 -08:00
Zachary Hampton
f5784e0191 Update to iOS mobile app headers for improved API stability
- Replace browser-based headers with iOS mobile app headers
- Update GraphQL query names to match iOS app conventions (1:1 alignment)
- Add _graphql_post() wrapper to centralize GraphQL calls with dynamic operation names
- Simplify session management by removing unnecessary thread-local complexity
- Add test_parallel_search_consistency test to verify concurrent request stability
- Bump version from 0.8.6b to 0.8.7

Changes fix API flakiness under concurrent load - parallel consistency test now passes 100% (5/5 runs).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 19:27:47 -08:00
Zachary Hampton
57093f5d17 Merge pull request #145 from ZacharyHampton/fix/realtor-403-error
Fix 403 error from Realtor.com API changes
2025-12-04 23:10:32 -08:00
zacharyhampton
406ff97260 - version bump 2025-12-04 23:08:37 -08:00
zacharyhampton
a8c9d0fd66 Replace REST autocomplete with GraphQL Search_suggestions query
- Replace /suggest REST endpoint with GraphQL Search_suggestions query
- Use search_location field instead of individual city/county/state/postal_code fields
- Fix coordinate order to [lon, lat] (GeoJSON standard) for radius searches
- Extract mpr_id from addr: prefix for single address lookups

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 21:08:01 -08:00
Zachary Hampton
0b283e18bd Fix 403 error from Realtor.com API changes
- Update GraphQL endpoint to api.frontdoor.realtor.com
- Update HTTP headers with newer Chrome version and correct client name/version
- Improve error handling in handle_home method
- Fix response validation for missing/null data

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 18:56:10 -08:00
Zachary Hampton
8bf1f9e24b Add regression test for listing_type=None including sold listings
Adds test_listing_type_none_includes_sold() to verify that when listing_type=None, sold listings are included in the results. This prevents regression of issue #142.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:38:48 -08:00
Zachary Hampton
79b2b648f5 Fix sold listings not included when listing_type=None (issue #142)
When listing_type=None, sold listings were excluded despite documentation stating all types should be returned. This fix includes two changes:

1. Explicitly include common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending empty status parameter
2. Fix or_filters logic to only apply for PENDING when not mixed with other types like SOLD, preventing unintended filtering

Updated README documentation to accurately reflect that None returns common listing types rather than all 8 types.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:30:54 -08:00
6 changed files with 446 additions and 120 deletions

View File

@@ -84,7 +84,7 @@ properties = scrape_property(
#### Sorting & Listing Types
```py
# Sort options: list_price, list_date, sqft, beds, baths, last_update_date
# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all)
# Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
properties = scrape_property(
location="Miami, FL",
listing_type=["for_sale", "pending"], # Single string, list, or None
@@ -158,7 +158,7 @@ Required
│ - 'other'
│ - 'ready_to_build'
│ - List of strings returns properties matching ANY status: ['for_sale', 'pending']
│ - None returns all listing types
│ - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
Optional
├── property_type (list): Choose the type of properties.

View File

@@ -76,26 +76,24 @@ class Scraper:
total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
"accept": "application/json, text/javascript",
"accept-language": "en-US,en;q=0.9",
"cache-control": "no-cache",
"content-type": "application/json",
"origin": "https://www.realtor.com",
"pragma": "no-cache",
"priority": "u=1, i",
"rdc-ab-tests": "commute_travel_time_variation:v1",
"sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
'Host': 'api.frontdoor.realtor.com',
'rdc-ab-test-client': 'ios_for_sale',
'Content-Type': 'application/json',
'apollographql-client-version': '26.9.25-26.9.25.0774600',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '26.9.25',
'X-APOLLO-OPERATION-TYPE': 'query',
'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor',
'apollographql-client-name': 'com.move.Realtor-apollo-ios',
'newrelic': '',
'transparent': '',
'User-Agent': 'Realtor.com/26.9.25.0774600 CFNetwork/3860.200.71 Darwin/25.1.0',
}
)

View File

@@ -26,7 +26,7 @@ from ..models import (
ListingType,
ReturnType
)
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT
from .processors import (
process_property,
process_extra_property_details,
@@ -35,64 +35,128 @@ from .processors import (
class RealtorScraper(Scraper):
SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
SEARCH_GQL_URL = "https://api.frontdoor.realtor.com/graphql"
NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200
def __init__(self, scraper_input):
super().__init__(scraper_input)
def handle_location(self):
# Get client_id from listing_type
if self.listing_type is None:
client_id = "for-sale"
elif isinstance(self.listing_type, list):
client_id = self.listing_type[0].value.lower().replace("_", "-") if self.listing_type else "for-sale"
else:
client_id = self.listing_type.value.lower().replace("_", "-")
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
"""
Execute a GraphQL query with operation-specific headers.
params = {
"input": self.location,
"client_id": client_id,
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
Args:
query: GraphQL query string (must include operationName matching operation_name param)
variables: Query variables dictionary
operation_name: Name of the GraphQL operation for Apollo headers
Returns:
Response JSON dictionary
"""
# Set operation-specific header (must match query's operationName)
self.session.headers['X-APOLLO-OPERATION-NAME'] = operation_name
payload = {
"operationName": operation_name, # Include in payload
"query": query,
"variables": variables,
}
response = self.session.get(
self.ADDRESS_AUTOCOMPLETE_URL,
params=params,
)
response_json = response.json()
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
return response.json()
result = response_json["autocomplete"]
@retry(
retry=retry_if_exception_type(Exception),
wait=wait_exponential(multiplier=1, min=1, max=4),
stop=stop_after_attempt(3),
)
def handle_location(self):
query = """
fragment SuggestionFragment on SearchSuggestionGeoResult {
type
text
geo {
_id
area_type
city
state_code
postal_code
county
centroid { lat lon }
slug_id
geo_id
}
}
query SearchSuggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
geo_results {
...SuggestionFragment
}
}
}"""
if not result:
variables = {
"searchInput": {
"search_term": self.location
}
}
response_json = self._graphql_post(query, variables, "SearchSuggestions")
if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "search_suggestions" not in response_json["data"]
or response_json["data"]["search_suggestions"] is None
or "geo_results" not in response_json["data"]["search_suggestions"]
or not response_json["data"]["search_suggestions"]["geo_results"]
):
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if response_json and "errors" in response_json:
error_msgs = [e.get("message", "") for e in response_json.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return None
return result[0]
geo_result = response_json["data"]["search_suggestions"]["geo_results"][0]
geo = geo_result.get("geo", {})
result = {
"text": geo_result.get("text"),
"area_type": geo.get("area_type"),
"city": geo.get("city"),
"state_code": geo.get("state_code"),
"postal_code": geo.get("postal_code"),
"county": geo.get("county"),
"centroid": geo.get("centroid"),
}
if geo.get("area_type") == "address":
geo_id = geo.get("_id", "")
if geo_id.startswith("addr:"):
result["mpr_id"] = geo_id.replace("addr:", "")
return result
def get_latest_listing_id(self, property_id: str) -> str | None:
query = """query Property($property_id: ID!) {
query = """
fragment ListingFragment on Listing {
listing_id
primary
}
query GetPropertyListingId($property_id: ID!) {
property(id: $property_id) {
listings {
listing_id
primary
...ListingFragment
}
}
}
"""
variables = {"property_id": property_id}
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
response_json = self._graphql_post(query, variables, "GetPropertyListingId")
property_info = response_json["data"]["property"]
if property_info["listings"] is None:
@@ -108,31 +172,43 @@ class RealtorScraper(Scraper):
return property_info["listings"][0]["listing_id"]
def handle_home(self, property_id: str) -> list[Property]:
"""Fetch single home with proper error handling."""
query = (
"""query Home($property_id: ID!) {
home(property_id: $property_id) %s
"""%s
query GetHomeDetails($property_id: ID!) {
home(property_id: $property_id) {
...HomeDetailsFragment
}
}"""
% HOMES_DATA
% HOME_FRAGMENT
)
variables = {"property_id": property_id}
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
try:
data = self._graphql_post(query, variables, "GetHomeDetails")
property_info = response_json["data"]["home"]
# Check for errors or missing data
if "errors" in data or "data" not in data:
return []
if self.return_type != ReturnType.raw:
return [process_property(property_info, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key, process_extra_property_details)]
else:
return [property_info]
if data["data"] is None or "home" not in data["data"]:
return []
property_info = data["data"]["home"]
if property_info is None:
return []
# Process based on return type
if self.return_type != ReturnType.raw:
return [process_property(property_info, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key,
process_extra_property_details)]
else:
return [property_info]
except Exception:
return []
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
"""
@@ -144,7 +220,15 @@ class RealtorScraper(Scraper):
# Determine date field based on listing type
# Convert listing_type to list for uniform handling
if self.listing_type is None:
listing_types = []
# When None, return all common listing types as documented
# Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
listing_types = [
ListingType.FOR_SALE,
ListingType.FOR_RENT,
ListingType.SOLD,
ListingType.PENDING,
ListingType.OFF_MARKET,
]
date_field = None # When no listing_type is specified, skip date filtering
elif isinstance(self.listing_type, list):
listing_types = self.listing_type
@@ -277,10 +361,14 @@ class RealtorScraper(Scraper):
else:
sort_param = "" #: prioritize normal fractal sort from realtor
# Handle PENDING with or_filters (applies if PENDING is in the list or is the single type)
# Handle PENDING with or_filters
# Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
# Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
has_pending = ListingType.PENDING in listing_types
other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
use_or_filters = has_pending and len(other_types) == 0
pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" if has_pending else ""
"or_filters: { contingent: true, pending: true }" if use_or_filters else ""
)
# Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -317,12 +405,13 @@ class RealtorScraper(Scraper):
is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address
query = """query Property_search(
query = """%s
query GetHomeSearch(
$coordinates: [Float]!
$radius: String!
$offset: Int!,
) {
home_search(
homeSearch: home_search(
query: {
%s
nearby: {
@@ -340,6 +429,7 @@ class RealtorScraper(Scraper):
offset: $offset
) %s
}""" % (
SEARCH_RESULTS_FRAGMENT,
is_foreclosure,
status_param,
date_param,
@@ -350,20 +440,15 @@ class RealtorScraper(Scraper):
GENERAL_RESULTS_QUERY,
)
elif search_type == "area": #: general search, came from a general location
query = """query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
query = """%s
query GetHomeSearch(
$search_location: SearchLocation,
$offset: Int,
) {
home_search(
homeSearch: home_search(
query: {
%s
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
search_location: $search_location
%s
%s
%s
@@ -376,6 +461,7 @@ class RealtorScraper(Scraper):
offset: $offset
) %s
}""" % (
SEARCH_RESULTS_FRAGMENT,
is_foreclosure,
status_param,
date_param,
@@ -388,11 +474,12 @@ class RealtorScraper(Scraper):
)
else: #: general search, came from an address
query = (
"""query Property_search(
"""%s
query GetHomeSearch(
$property_id: [ID]!
$offset: Int!,
) {
home_search(
homeSearch: home_search(
query: {
property_id: $property_id
}
@@ -400,17 +487,11 @@ class RealtorScraper(Scraper):
offset: $offset
) %s
}"""
% GENERAL_RESULTS_QUERY
% (SEARCH_RESULTS_FRAGMENT, GENERAL_RESULTS_QUERY)
)
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search"
response_json = self._graphql_post(query, variables, "GetHomeSearch")
search_key = "homeSearch"
properties: list[Union[Property, dict]] = []
@@ -499,24 +580,16 @@ class RealtorScraper(Scraper):
if not location_info.get("centroid"):
return []
coordinates = list(location_info["centroid"].values())
centroid = location_info["centroid"]
coordinates = [centroid["lon"], centroid["lat"]] # GeoJSON order: [lon, lat]
search_variables |= {
"coordinates": coordinates,
"radius": "{}mi".format(self.radius),
}
elif location_type == "postal_code":
else: #: general search (city, county, postal_code, etc.)
search_variables |= {
"postal_code": location_info.get("postal_code"),
}
else: #: general search, location
search_variables |= {
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"search_location": {"location": location_info.get("text")},
}
if self.foreclosure:
@@ -1038,8 +1111,8 @@ class RealtorScraper(Scraper):
@retry(
retry=retry_if_exception_type(JSONDecodeError),
wait=wait_exponential(min=4, max=10),
retry=retry_if_exception_type((JSONDecodeError, Exception)),
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
)
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
@@ -1052,21 +1125,24 @@ class RealtorScraper(Scraper):
property_ids = list(set(property_ids))
# Construct the bulk query
fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}'
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeDetailsFragment }}'
for property_id in property_ids
)
query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json()
query GetHomeDetails {{
{fragments}
}}"""
data = self._graphql_post(query, {}, "GetHomeDetails")
if "data" not in data:
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if data and "errors" in data:
error_msgs = [e.get("message", "") for e in data.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return {}
properties = data["data"]

View File

@@ -1,3 +1,200 @@
SEARCH_RESULTS_FRAGMENT = """
fragment SearchFragment on SearchHome {
__typename
pending_date
listing_id
property_id
href
permalink
list_date
status
mls_status
last_sold_price
last_sold_date
last_status_change_date
last_update_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
open_houses {
start_date
end_date
description
time_zone
dst
href
methods
}
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
photos(https: true) {
title
href
tags {
label
}
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
}
tax_record {
cl_id
public_record_id
last_update_date
apn
tax_parcel_id
}
primary_photo(https: true) {
href
}
photos(https: true) {
title
href
tags {
label
}
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
state_license
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
current_estimates {
__typename
source {
__typename
type
name
}
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
"""
_SEARCH_HOMES_DATA_BASE = """{
pending_date
listing_id
@@ -181,7 +378,7 @@ _SEARCH_HOMES_DATA_BASE = """{
HOME_FRAGMENT = """
fragment HomeData on Home {
fragment HomeDetailsFragment on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
@@ -300,8 +497,19 @@ current_estimates {
}
}""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{
# Query body using inline fields (kept for backward compatibility).
# The per-home field list is substituted into the `results` selection from
# SEARCH_HOMES_DATA via %-formatting, so this body does not rely on any
# GraphQL fragment being present in the final query string.
GENERAL_RESULTS_QUERY_BODY = """{
count
total
results %s
}""" % SEARCH_HOMES_DATA
# Selection set for search results: requests the count/total fields and
# spreads the `SearchFragment` fragment for each entry in `results`.
# NOTE: the fragment itself is not defined in this string; a query that embeds
# this selection must also include the definition of `SearchFragment`
# (SEARCH_RESULTS_FRAGMENT in this module) for the spread to resolve.
GENERAL_RESULTS_QUERY = """{
__typename
count
total
results {
__typename
...SearchFragment
}
}"""

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.8.4"
version = "0.8.8"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -1,4 +1,5 @@
import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed
from homeharvest import scrape_property, Property
import pandas as pd
@@ -87,6 +88,25 @@ def test_realtor_date_range_sold():
)
def test_listing_type_none_includes_sold():
    """Regression check for issue #142: listing_type=None must still return sold listings."""
    # Search without restricting the listing type.
    listings = scrape_property(listing_type=None, location="Warren, MI")

    # There must be something to inspect before looking at statuses.
    assert listings is not None
    assert len(listings) > 0

    # Distinct values present in the 'status' column of the result.
    seen_statuses = {status for status in listings['status'].unique()}

    # Sold listings have to be part of the unfiltered result set.
    assert 'SOLD' in seen_statuses, "SOLD listings should be included when listing_type=None"

    # An unfiltered search should not collapse to a single listing type.
    assert len(seen_statuses) > 1, "Should return multiple listing types when listing_type=None"
def test_realtor_single_property():
results = [
scrape_property(
@@ -288,6 +308,30 @@ def test_phone_number_matching():
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_parallel_search_consistency():
    """Run one identical search on three threads at once and check the results agree."""
    worker_count = 3

    def run_search():
        # Every worker issues exactly the same query.
        return scrape_property(location="Phoenix, AZ", listing_type="for_sale", limit=100)

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(run_search) for _ in range(worker_count)]
        # Gather in completion order; ordering is irrelevant to the checks below.
        frames = [done.result() for done in as_completed(pending)]

    # Each search must have produced a non-empty DataFrame.
    assert all(frame is not None for frame in frames)
    assert all(isinstance(frame, pd.DataFrame) for frame in frames)
    assert all(len(frame) > 0 for frame in frames)

    # Primary consistency check: every run returned the same number of rows.
    lengths = list(map(len, frames))
    assert len(set(lengths)) == 1, \
        f"All parallel searches should return same number of results, got lengths: {lengths}"
def test_return_type():
results = {
"pandas": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)],