Compare commits

...

12 Commits

Author SHA1 Message Date
zacharyhampton
8a6ac96db4 Refactor scraper to use direct requests and bump to 0.8.18
- Replace session-based approach with direct requests calls
- Move headers to module-level DEFAULT_HEADERS constant
- Temporarily disable extra_property_data feature

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 00:29:53 -07:00
zacharyhampton
129ab37dff Version bump to 0.8.17
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 19:11:10 -07:00
zacharyhampton
9a0cac650e Version bump to 0.8.16
2025-12-21 16:22:03 -07:00
zacharyhampton
a1c1bcc822 Version bump to 0.8.15
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 16:03:57 -07:00
zacharyhampton
6f3faceb27 Version bump to 0.8.14
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 14:32:59 -07:00
zacharyhampton
cab0216f29 Version bump to 0.8.13
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 12:30:46 -07:00
zacharyhampton
8ee720ce5c Version bump to 0.8.12
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-20 15:30:26 -07:00
zacharyhampton
8eb138ee1a Version bump to 0.8.11
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 22:42:01 -07:00
Zachary Hampton
ef6db606fd Version bump to 0.8.10
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-16 18:32:33 -08:00
zacharyhampton
9406c92a66 Version bump to 0.8.9
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 17:55:33 -08:00
zacharyhampton
fefacdd264 Version bump to 0.8.8
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 17:32:06 -08:00
Zachary Hampton
3579c10196 Merge pull request #147 from ZacharyHampton/feature/ios-mobile-headers
Improve API stability and reliability
2025-12-05 19:30:25 -08:00
4 changed files with 589 additions and 96 deletions

View File

@@ -2,8 +2,6 @@ from __future__ import annotations
 from typing import Union
 import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
 import uuid
 from ...exceptions import AuthenticationError
 from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
@@ -11,6 +9,27 @@ import json
from pydantic import BaseModel from pydantic import BaseModel
DEFAULT_HEADERS = {
'Content-Type': 'application/json',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Origin': 'https://www.realtor.com',
'Pragma': 'no-cache',
'Referer': 'https://www.realtor.com/',
'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE',
'rdc-client-version': '3.0.2515',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-is-bot': 'false',
}
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
location: str location: str
listing_type: ListingType | list[ListingType] | None listing_type: ListingType | list[ListingType] | None
@@ -60,8 +79,6 @@ class ScraperInput(BaseModel):
class Scraper: class Scraper:
session = None
def __init__( def __init__(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
@@ -69,36 +86,8 @@ class Scraper:
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.property_type = scraper_input.property_type self.property_type = scraper_input.property_type
self.proxy = scraper_input.proxy
if not self.session: self.proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
Scraper.session = requests.Session()
retries = Retry(
total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
'Host': 'api.frontdoor.realtor.com',
'rdc-ab-test-client': 'ios_for_sale',
'Content-Type': 'application/json',
'apollographql-client-version': '26.9.25-26.9.25.0774600',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '26.9.25',
'X-APOLLO-OPERATION-TYPE': 'query',
'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor',
'apollographql-client-name': 'com.move.Realtor-apollo-ios',
'User-Agent': 'Realtor.com/26.9.25.0774600 CFNetwork/3860.200.71 Darwin/25.1.0',
}
)
if scraper_input.proxy:
proxy_url = scraper_input.proxy
proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.radius = scraper_input.radius self.radius = scraper_input.radius
@@ -109,7 +98,7 @@ class Scraper:
 self.date_from_precision = scraper_input.date_from_precision
 self.date_to_precision = scraper_input.date_to_precision
 self.foreclosure = scraper_input.foreclosure
-self.extra_property_data = scraper_input.extra_property_data
+self.extra_property_data = False # TODO: temporarily disabled
 self.exclude_pending = scraper_input.exclude_pending
 self.limit = scraper_input.limit
 self.offset = scraper_input.offset

View File

@@ -8,25 +8,27 @@ This module implements the scraper for realtor.com
from __future__ import annotations from __future__ import annotations
import json import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
from typing import Dict, Union from typing import Dict, Union
from tenacity import ( from tenacity import (
retry, retry,
retry_if_exception_type, retry_if_exception_type,
retry_if_not_exception_type,
wait_exponential, wait_exponential,
stop_after_attempt, stop_after_attempt,
) )
from .. import Scraper from .. import Scraper, DEFAULT_HEADERS
from ....exceptions import AuthenticationError
from ..models import ( from ..models import (
Property, Property,
ListingType, ListingType,
ReturnType ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT from .queries import GENERAL_RESULTS_QUERY, HOMES_DATA, SEARCH_SUGGESTIONS_QUERY
from .processors import ( from .processors import (
process_property, process_property,
process_extra_property_details, process_extra_property_details,
@@ -35,35 +37,53 @@ from .processors import (
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
SEARCH_GQL_URL = "https://api.frontdoor.realtor.com/graphql" SEARCH_GQL_URL = "https://www.realtor.com/frontdoor/graphql"
NUM_PROPERTY_WORKERS = 20 NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200 DEFAULT_PAGE_SIZE = 200
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
@staticmethod
def _minify_query(query: str) -> str:
"""Minify GraphQL query by collapsing whitespace to single spaces."""
# Split on whitespace, filter empty strings, join with single space
return ' '.join(query.split())
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict: def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
""" """
Execute a GraphQL query with operation-specific headers. Execute a GraphQL query.
Args: Args:
query: GraphQL query string (must include operationName matching operation_name param) query: GraphQL query string (must include operationName matching operation_name param)
variables: Query variables dictionary variables: Query variables dictionary
operation_name: Name of the GraphQL operation for Apollo headers operation_name: Name of the GraphQL operation
Returns: Returns:
Response JSON dictionary Response JSON dictionary
""" """
# Set operation-specific header (must match query's operationName)
self.session.headers['X-APOLLO-OPERATION-NAME'] = operation_name
payload = { payload = {
"operationName": operation_name, # Include in payload "operationName": operation_name,
"query": query, "query": self._minify_query(query),
"variables": variables, "variables": variables,
} }
response = self.session.post(self.SEARCH_GQL_URL, json=payload) response = requests.post(
self.SEARCH_GQL_URL,
headers=DEFAULT_HEADERS,
data=json.dumps(payload, separators=(',', ':')),
proxies=self.proxies
)
if response.status_code == 403:
if not self.proxy:
raise AuthenticationError(
"Received 403 Forbidden from Realtor.com API.",
response=response
)
else:
raise Exception("Received 403 Forbidden, retrying...")
return response.json() return response.json()
@retry( @retry(
@@ -72,33 +92,13 @@ class RealtorScraper(Scraper):
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
def handle_location(self): def handle_location(self):
query = """query SearchSuggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
geo_results {
type
text
geo {
_id
area_type
city
state_code
postal_code
county
centroid { lat lon }
slug_id
geo_id
}
}
}
}"""
variables = { variables = {
"searchInput": { "searchInput": {
"search_term": self.location "search_term": self.location
} }
} }
response_json = self._graphql_post(query, variables, "SearchSuggestions") response_json = self._graphql_post(SEARCH_SUGGESTIONS_QUERY, variables, "Search_suggestions")
if ( if (
response_json is None response_json is None
@@ -130,6 +130,11 @@ class RealtorScraper(Scraper):
} }
if geo.get("area_type") == "address": if geo.get("area_type") == "address":
# Try to get mpr_id directly from API response first
if geo.get("mpr_id"):
result["mpr_id"] = geo.get("mpr_id")
else:
# Fallback: extract from _id field if it has addr: prefix
geo_id = geo.get("_id", "") geo_id = geo.get("_id", "")
if geo_id.startswith("addr:"): if geo_id.startswith("addr:"):
result["mpr_id"] = geo_id.replace("addr:", "") result["mpr_id"] = geo_id.replace("addr:", "")
@@ -137,12 +142,16 @@ class RealtorScraper(Scraper):
return result return result
def get_latest_listing_id(self, property_id: str) -> str | None: def get_latest_listing_id(self, property_id: str) -> str | None:
query = """query GetPropertyListingId($property_id: ID!) { query = """
property(id: $property_id) { fragment ListingFragment on Listing {
listings {
listing_id listing_id
primary primary
} }
query GetPropertyListingId($property_id: ID!) {
property(id: $property_id) {
listings {
...ListingFragment
}
} }
} }
""" """
@@ -399,7 +408,7 @@ class RealtorScraper(Scraper):
$radius: String! $radius: String!
$offset: Int!, $offset: Int!,
) { ) {
home_search( homeSearch: home_search(
query: { query: {
%s %s
nearby: { nearby: {
@@ -429,9 +438,9 @@ class RealtorScraper(Scraper):
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query GetHomeSearch( query = """query GetHomeSearch(
$search_location: SearchLocation, $search_location: SearchLocation,
$offset: Int, $offset: Int
) { ) {
home_search( homeSearch: home_search(
query: { query: {
%s %s
search_location: $search_location search_location: $search_location
@@ -463,7 +472,7 @@ class RealtorScraper(Scraper):
$property_id: [ID]! $property_id: [ID]!
$offset: Int!, $offset: Int!,
) { ) {
home_search( homeSearch: home_search(
query: { query: {
property_id: $property_id property_id: $property_id
} }
@@ -475,7 +484,7 @@ class RealtorScraper(Scraper):
) )
response_json = self._graphql_post(query, variables, "GetHomeSearch") response_json = self._graphql_post(query, variables, "GetHomeSearch")
search_key = "home_search" if "home_search" in query else "property_search" search_key = "homeSearch"
properties: list[Union[Property, dict]] = [] properties: list[Union[Property, dict]] = []
@@ -1095,7 +1104,7 @@ class RealtorScraper(Scraper):
@retry( @retry(
retry=retry_if_exception_type((JSONDecodeError, Exception)), retry=retry_if_exception_type((JSONDecodeError, Exception)) & retry_if_not_exception_type(AuthenticationError),
wait=wait_exponential(multiplier=1, min=1, max=10), wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
@@ -1109,20 +1118,17 @@ class RealtorScraper(Scraper):
property_ids = list(set(property_ids)) property_ids = list(set(property_ids))
# Construct the bulk query
fragments = "\n".join( fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}' f'home_{property_id}: home(property_id: {property_id}) {HOMES_DATA}'
for property_id in property_ids for property_id in property_ids
) )
query = f"""{HOME_FRAGMENT} query = f"""query GetHome {{
query GetBulkPropertyDetails {{
{fragments} {fragments}
}}""" }}"""
data = self._graphql_post(query, {}, "GetBulkPropertyDetails") data = self._graphql_post(query, {}, "GetHome")
if "data" not in data: if "data" not in data or data["data"] is None:
# If we got a 400 error with "Required parameter is missing", raise to trigger retry # If we got a 400 error with "Required parameter is missing", raise to trigger retry
if data and "errors" in data: if data and "errors" in data:
error_msgs = [e.get("message", "") for e in data.get("errors", [])] error_msgs = [e.get("message", "") for e in data.get("errors", [])]
@@ -1131,6 +1137,6 @@ query GetBulkPropertyDetails {{
return {} return {}
properties = data["data"] properties = data["data"]
return {data.replace('home_', ''): properties[data] for data in properties if properties[data]} return {key.replace('home_', ''): properties[key] for key in properties if properties[key]}

View File

@@ -1,3 +1,193 @@
SEARCH_RESULTS_FRAGMENT = """
fragment PropertyResult on SearchHome {
__typename
pending_date
listing_id
property_id
href
permalink
list_date
status
mls_status
last_sold_price
last_sold_date
last_status_change_date
last_update_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
open_houses {
start_date
end_date
description
time_zone
dst
href
methods
}
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
photos(https: true) {
title
href
tags {
label
}
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
}
tax_record {
cl_id
public_record_id
last_update_date
apn
tax_parcel_id
}
primary_photo(https: true) {
href
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
state_license
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
current_estimates {
__typename
source {
__typename
type
name
}
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
"""
_SEARCH_HOMES_DATA_BASE = """{ _SEARCH_HOMES_DATA_BASE = """{
pending_date pending_date
listing_id listing_id
@@ -181,8 +371,189 @@ _SEARCH_HOMES_DATA_BASE = """{
HOME_FRAGMENT = """ HOME_FRAGMENT = """
fragment HomeData on Home { fragment PropertyResult on Home {
__typename
pending_date
listing_id
property_id property_id
href
permalink
list_date
status
mls_status
last_sold_price
last_sold_date
last_status_change_date
last_update_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
open_houses {
start_date
end_date
description
time_zone
dst
href
methods
}
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
photos(https: true) {
title
href
tags {
label
}
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
parcel {
parcel_id
}
}
tax_record {
cl_id
public_record_id
last_update_date
apn
tax_parcel_id
}
primary_photo(https: true) {
href
}
photos(https: true) {
title
href
tags {
label
}
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
state_license
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
} }
@@ -198,11 +569,6 @@ fragment HomeData on Home {
last_n_days last_n_days
} }
} }
location {
parcel {
parcel_id
}
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } } taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
property_history { property_history {
date date
@@ -227,6 +593,18 @@ fragment HomeData on Home {
text text
category category
} }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
} }
""" """
@@ -300,8 +678,128 @@ current_estimates {
} }
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{ # Query body using inline fields (kept for backward compatibility)
GENERAL_RESULTS_QUERY_BODY = """{
count count
total total
results %s results %s
}""" % SEARCH_HOMES_DATA }""" % SEARCH_HOMES_DATA
GENERAL_RESULTS_QUERY = """{
__typename
count
total
results %s
}""" % SEARCH_HOMES_DATA
LISTING_PHOTOS_FRAGMENT = """
fragment ListingPhotosFragment on SearchHome {
__typename
photos(https: true) {
__typename
title
href
tags {
__typename
label
probability
}
}
}
"""
SEARCH_SUGGESTIONS_QUERY = """query Search_suggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
raw_input_parser_result
typeahead_results {
display_string
display_geo
geo {
_id
_score
mpr_id
area_type
city
state_code
state
postal_code
country
lat
lon
county
counties {
name
fips
state_code
}
slug_id
geo_id
score
name
city_slug_id
centroid {
lat
lon
}
county_needed_for_uniq
street
line
school
school_id
school_district
has_catchment
university
university_id
neighborhood
park
}
url
}
geo_results {
type
text
geo {
_id
_score
mpr_id
area_type
city
state_code
state
postal_code
country
lat
lon
county
counties {
name
fips
state_code
}
slug_id
geo_id
score
name
city_slug_id
centroid {
lat
lon
}
county_needed_for_uniq
street
line
school
school_id
school_district
has_catchment
university
university_id
neighborhood
park
}
}
no_matches
has_results
original_string
}
}"""

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.8.7"
+version = "0.8.18"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"