Compare commits

..

8 Commits

Author SHA1 Message Date
Zachary Hampton
3579c10196 Merge pull request #147 from ZacharyHampton/feature/ios-mobile-headers
Improve API stability and reliability
2025-12-05 19:30:25 -08:00
Zachary Hampton
f5784e0191 Update to iOS mobile app headers for improved API stability
- Replace browser-based headers with iOS mobile app headers
- Update GraphQL query names to match iOS app conventions (1:1 alignment)
- Add _graphql_post() wrapper to centralize GraphQL calls with dynamic operation names
- Simplify session management by removing unnecessary thread-local complexity
- Add test_parallel_search_consistency test to verify concurrent request stability
- Bump version from 0.8.6b to 0.8.7

Changes fix API flakiness under concurrent load - parallel consistency test now passes 100% (5/5 runs).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 19:27:47 -08:00
Zachary Hampton
57093f5d17 Merge pull request #145 from ZacharyHampton/fix/realtor-403-error
Fix 403 error from Realtor.com API changes
2025-12-04 23:10:32 -08:00
zacharyhampton
406ff97260 - version bump 2025-12-04 23:08:37 -08:00
zacharyhampton
a8c9d0fd66 Replace REST autocomplete with GraphQL Search_suggestions query
- Replace /suggest REST endpoint with GraphQL Search_suggestions query
- Use search_location field instead of individual city/county/state/postal_code fields
- Fix coordinate order to [lon, lat] (GeoJSON standard) for radius searches
- Extract mpr_id from addr: prefix for single address lookups

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 21:08:01 -08:00
Zachary Hampton
0b283e18bd Fix 403 error from Realtor.com API changes
- Update GraphQL endpoint to api.frontdoor.realtor.com
- Update HTTP headers with newer Chrome version and correct client name/version
- Improve error handling in handle_home method
- Fix response validation for missing/null data

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 18:56:10 -08:00
Zachary Hampton
8bf1f9e24b Add regression test for listing_type=None including sold listings
Adds test_listing_type_none_includes_sold() to verify that when listing_type=None, sold listings are included in the results. This prevents regression of issue #142.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:38:48 -08:00
Zachary Hampton
79b2b648f5 Fix sold listings not included when listing_type=None (issue #142)
When listing_type=None, sold listings were excluded despite documentation stating all types should be returned. This fix includes two changes:

1. Explicitly include common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending empty status parameter
2. Fix or_filters logic to only apply for PENDING when not mixed with other types like SOLD, preventing unintended filtering

Updated README documentation to accurately reflect that None returns common listing types rather than all 8 types.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:30:54 -08:00
5 changed files with 207 additions and 106 deletions

View File

@@ -84,7 +84,7 @@ properties = scrape_property(
#### Sorting & Listing Types
```py
# Sort options: list_price, list_date, sqft, beds, baths, last_update_date
# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all)
# Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
properties = scrape_property(
location="Miami, FL",
listing_type=["for_sale", "pending"], # Single string, list, or None
@@ -158,7 +158,7 @@ Required
│ - 'other'
│ - 'ready_to_build'
│ - List of strings returns properties matching ANY status: ['for_sale', 'pending']
│ - None returns all listing types
│ - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
Optional
├── property_type (list): Choose the type of properties.

View File

@@ -76,26 +76,22 @@ class Scraper:
total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
"accept": "application/json, text/javascript",
"accept-language": "en-US,en;q=0.9",
"cache-control": "no-cache",
"content-type": "application/json",
"origin": "https://www.realtor.com",
"pragma": "no-cache",
"priority": "u=1, i",
"rdc-ab-tests": "commute_travel_time_variation:v1",
"sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
'Host': 'api.frontdoor.realtor.com',
'rdc-ab-test-client': 'ios_for_sale',
'Content-Type': 'application/json',
'apollographql-client-version': '26.9.25-26.9.25.0774600',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '26.9.25',
'X-APOLLO-OPERATION-TYPE': 'query',
'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor',
'apollographql-client-name': 'com.move.Realtor-apollo-ios',
'User-Agent': 'Realtor.com/26.9.25.0774600 CFNetwork/3860.200.71 Darwin/25.1.0',
}
)

View File

@@ -35,47 +35,109 @@ from .processors import (
class RealtorScraper(Scraper):
SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
SEARCH_GQL_URL = "https://api.frontdoor.realtor.com/graphql"
NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200
def __init__(self, scraper_input):
super().__init__(scraper_input)
def handle_location(self):
# Get client_id from listing_type
if self.listing_type is None:
client_id = "for-sale"
elif isinstance(self.listing_type, list):
client_id = self.listing_type[0].value.lower().replace("_", "-") if self.listing_type else "for-sale"
else:
client_id = self.listing_type.value.lower().replace("_", "-")
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
"""
Execute a GraphQL query with operation-specific headers.
params = {
"input": self.location,
"client_id": client_id,
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
Args:
query: GraphQL query string (must include operationName matching operation_name param)
variables: Query variables dictionary
operation_name: Name of the GraphQL operation for Apollo headers
Returns:
Response JSON dictionary
"""
# Set operation-specific header (must match query's operationName)
self.session.headers['X-APOLLO-OPERATION-NAME'] = operation_name
payload = {
"operationName": operation_name, # Include in payload
"query": query,
"variables": variables,
}
response = self.session.get(
self.ADDRESS_AUTOCOMPLETE_URL,
params=params,
)
response_json = response.json()
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
return response.json()
result = response_json["autocomplete"]
@retry(
retry=retry_if_exception_type(Exception),
wait=wait_exponential(multiplier=1, min=1, max=4),
stop=stop_after_attempt(3),
)
def handle_location(self):
query = """query SearchSuggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
geo_results {
type
text
geo {
_id
area_type
city
state_code
postal_code
county
centroid { lat lon }
slug_id
geo_id
}
}
}
}"""
if not result:
variables = {
"searchInput": {
"search_term": self.location
}
}
response_json = self._graphql_post(query, variables, "SearchSuggestions")
if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "search_suggestions" not in response_json["data"]
or response_json["data"]["search_suggestions"] is None
or "geo_results" not in response_json["data"]["search_suggestions"]
or not response_json["data"]["search_suggestions"]["geo_results"]
):
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if response_json and "errors" in response_json:
error_msgs = [e.get("message", "") for e in response_json.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return None
return result[0]
geo_result = response_json["data"]["search_suggestions"]["geo_results"][0]
geo = geo_result.get("geo", {})
result = {
"text": geo_result.get("text"),
"area_type": geo.get("area_type"),
"city": geo.get("city"),
"state_code": geo.get("state_code"),
"postal_code": geo.get("postal_code"),
"county": geo.get("county"),
"centroid": geo.get("centroid"),
}
if geo.get("area_type") == "address":
geo_id = geo.get("_id", "")
if geo_id.startswith("addr:"):
result["mpr_id"] = geo_id.replace("addr:", "")
return result
def get_latest_listing_id(self, property_id: str) -> str | None:
query = """query Property($property_id: ID!) {
query = """query GetPropertyListingId($property_id: ID!) {
property(id: $property_id) {
listings {
listing_id
@@ -86,13 +148,7 @@ class RealtorScraper(Scraper):
"""
variables = {"property_id": property_id}
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
response_json = self._graphql_post(query, variables, "GetPropertyListingId")
property_info = response_json["data"]["property"]
if property_info["listings"] is None:
@@ -108,31 +164,40 @@ class RealtorScraper(Scraper):
return property_info["listings"][0]["listing_id"]
def handle_home(self, property_id: str) -> list[Property]:
"""Fetch single home with proper error handling."""
query = (
"""query Home($property_id: ID!) {
"""query GetHomeDetails($property_id: ID!) {
home(property_id: $property_id) %s
}"""
% HOMES_DATA
)
variables = {"property_id": property_id}
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
try:
data = self._graphql_post(query, variables, "GetHomeDetails")
property_info = response_json["data"]["home"]
# Check for errors or missing data
if "errors" in data or "data" not in data:
return []
if self.return_type != ReturnType.raw:
return [process_property(property_info, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key, process_extra_property_details)]
else:
return [property_info]
if data["data"] is None or "home" not in data["data"]:
return []
property_info = data["data"]["home"]
if property_info is None:
return []
# Process based on return type
if self.return_type != ReturnType.raw:
return [process_property(property_info, self.mls_only, self.extra_property_data,
self.exclude_pending, self.listing_type, get_key,
process_extra_property_details)]
else:
return [property_info]
except Exception:
return []
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
"""
@@ -144,7 +209,15 @@ class RealtorScraper(Scraper):
# Determine date field based on listing type
# Convert listing_type to list for uniform handling
if self.listing_type is None:
listing_types = []
# When None, return all common listing types as documented
# Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
listing_types = [
ListingType.FOR_SALE,
ListingType.FOR_RENT,
ListingType.SOLD,
ListingType.PENDING,
ListingType.OFF_MARKET,
]
date_field = None # When no listing_type is specified, skip date filtering
elif isinstance(self.listing_type, list):
listing_types = self.listing_type
@@ -277,10 +350,14 @@ class RealtorScraper(Scraper):
else:
sort_param = "" #: prioritize normal fractal sort from realtor
# Handle PENDING with or_filters (applies if PENDING is in the list or is the single type)
# Handle PENDING with or_filters
# Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
# Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
has_pending = ListingType.PENDING in listing_types
other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
use_or_filters = has_pending and len(other_types) == 0
pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" if has_pending else ""
"or_filters: { contingent: true, pending: true }" if use_or_filters else ""
)
# Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -317,7 +394,7 @@ class RealtorScraper(Scraper):
is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address
query = """query Property_search(
query = """query GetHomeSearch(
$coordinates: [Float]!
$radius: String!
$offset: Int!,
@@ -350,20 +427,14 @@ class RealtorScraper(Scraper):
GENERAL_RESULTS_QUERY,
)
elif search_type == "area": #: general search, came from a general location
query = """query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
query = """query GetHomeSearch(
$search_location: SearchLocation,
$offset: Int,
) {
home_search(
query: {
%s
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
search_location: $search_location
%s
%s
%s
@@ -388,7 +459,7 @@ class RealtorScraper(Scraper):
)
else: #: general search, came from an address
query = (
"""query Property_search(
"""query GetHomeSearch(
$property_id: [ID]!
$offset: Int!,
) {
@@ -403,13 +474,7 @@ class RealtorScraper(Scraper):
% GENERAL_RESULTS_QUERY
)
payload = {
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
response_json = self._graphql_post(query, variables, "GetHomeSearch")
search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Union[Property, dict]] = []
@@ -499,24 +564,16 @@ class RealtorScraper(Scraper):
if not location_info.get("centroid"):
return []
coordinates = list(location_info["centroid"].values())
centroid = location_info["centroid"]
coordinates = [centroid["lon"], centroid["lat"]] # GeoJSON order: [lon, lat]
search_variables |= {
"coordinates": coordinates,
"radius": "{}mi".format(self.radius),
}
elif location_type == "postal_code":
else: #: general search (city, county, postal_code, etc.)
search_variables |= {
"postal_code": location_info.get("postal_code"),
}
else: #: general search, location
search_variables |= {
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"search_location": {"location": location_info.get("text")},
}
if self.foreclosure:
@@ -1038,8 +1095,8 @@ class RealtorScraper(Scraper):
@retry(
retry=retry_if_exception_type(JSONDecodeError),
wait=wait_exponential(min=4, max=10),
retry=retry_if_exception_type((JSONDecodeError, Exception)),
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
)
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
@@ -1058,15 +1115,19 @@ class RealtorScraper(Scraper):
for property_id in property_ids
)
query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json()
query GetBulkPropertyDetails {{
{fragments}
}}"""
data = self._graphql_post(query, {}, "GetBulkPropertyDetails")
if "data" not in data:
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if data and "errors" in data:
error_msgs = [e.get("message", "") for e in data.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return {}
properties = data["data"]

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.8.4"
version = "0.8.7"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -1,4 +1,5 @@
import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed
from homeharvest import scrape_property, Property
import pandas as pd
@@ -87,6 +88,25 @@ def test_realtor_date_range_sold():
)
def test_listing_type_none_includes_sold():
"""Test that listing_type=None includes sold listings (issue #142)"""
# Get properties with listing_type=None (should include all common types)
result_none = scrape_property(
location="Warren, MI",
listing_type=None
)
# Verify we got results
assert result_none is not None and len(result_none) > 0
# Verify sold listings are included
status_types = set(result_none['status'].unique())
assert 'SOLD' in status_types, "SOLD listings should be included when listing_type=None"
# Verify we get multiple listing types (not just one)
assert len(status_types) > 1, "Should return multiple listing types when listing_type=None"
def test_realtor_single_property():
results = [
scrape_property(
@@ -288,6 +308,30 @@ def test_phone_number_matching():
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_parallel_search_consistency():
"""Test that the same search executed 3 times in parallel returns consistent results"""
def search_task():
return scrape_property(
location="Phoenix, AZ",
listing_type="for_sale",
limit=100
)
with ThreadPoolExecutor(max_workers=3) as executor:
futures = [executor.submit(search_task) for _ in range(3)]
results = [future.result() for future in as_completed(futures)]
# Verify all results are valid
assert all([result is not None for result in results])
assert all([isinstance(result, pd.DataFrame) for result in results])
assert all([len(result) > 0 for result in results])
# Verify all results have the same length (primary consistency check)
lengths = [len(result) for result in results]
assert len(set(lengths)) == 1, \
f"All parallel searches should return same number of results, got lengths: {lengths}"
def test_return_type():
results = {
"pandas": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)],