Compare commits

..

4 Commits

Author SHA1 Message Date
zacharyhampton
cab0216f29 Version bump to 0.8.13
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 12:30:46 -07:00
zacharyhampton
8ee720ce5c Version bump to 0.8.12
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-20 15:30:26 -07:00
zacharyhampton
8eb138ee1a Version bump to 0.8.11
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 22:42:01 -07:00
Zachary Hampton
ef6db606fd Version bump to 0.8.10
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-16 18:32:33 -08:00
4 changed files with 68 additions and 59 deletions

View File

@@ -5,6 +5,7 @@ import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
import secrets
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json import json
@@ -73,7 +74,7 @@ class Scraper:
if not self.session: if not self.session:
Scraper.session = requests.Session() Scraper.session = requests.Session()
retries = Retry( retries = Retry(
total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"]) total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"])
) )
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20) adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
@@ -81,25 +82,22 @@ class Scraper:
Scraper.session.mount("https://", adapter) Scraper.session.mount("https://", adapter)
Scraper.session.headers.update( Scraper.session.headers.update(
{ {
'Host': 'api.frontdoor.realtor.com',
'rdc-ab-test-client': 'ios_for_sale',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'apollographql-client-version': '26.9.25-26.9.25.0774600', 'apollographql-client-version': '26.11.1-26.11.1.1106489',
'Accept': '*/*', 'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '26.9.25', 'rdc-client-version': '26.11.1',
'X-APOLLO-OPERATION-TYPE': 'query', 'X-APOLLO-OPERATION-TYPE': 'query',
'X-APOLLO-OPERATION-ID': secrets.token_hex(32),
'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor', 'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor',
'apollographql-client-name': 'com.move.Realtor-apollo-ios', 'apollographql-client-name': 'com.move.Realtor-apollo-ios',
'newrelic': '', 'User-Agent': 'Realtor.com/26.11.1.1106489 CFNetwork/3860.200.71 Darwin/25.1.0',
'transparent': '',
'User-Agent': 'Realtor.com/26.9.25.0774600 CFNetwork/3860.200.71 Darwin/25.1.0',
} }
) )
if scraper_input.proxy: self.proxy = scraper_input.proxy
proxy_url = scraper_input.proxy if self.proxy:
proxies = {"http": proxy_url, "https": proxy_url} proxies = {"http": self.proxy, "https": self.proxy}
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type

View File

@@ -8,6 +8,7 @@ This module implements the scraper for realtor.com
from __future__ import annotations from __future__ import annotations
import json import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
@@ -21,12 +22,13 @@ from tenacity import (
) )
from .. import Scraper from .. import Scraper
from ....exceptions import AuthenticationError
from ..models import ( from ..models import (
Property, Property,
ListingType, ListingType,
ReturnType ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT, MORPHEUS_SUGGESTIONS_QUERY
from .processors import ( from .processors import (
process_property, process_property,
process_extra_property_details, process_extra_property_details,
@@ -42,6 +44,12 @@ class RealtorScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
@staticmethod
def _minify_query(query: str) -> str:
"""Minify GraphQL query by collapsing whitespace to single spaces."""
# str.split() with no separator splits on runs of any whitespace (spaces,
# tabs, newlines) and never yields empty strings, so no explicit filtering
# is needed; joining with ' ' leaves exactly one space between tokens and
# drops leading/trailing whitespace.
# NOTE(review): this is purely textual, so whitespace inside a GraphQL
# string literal in the query would be collapsed as well.
return ' '.join(query.split())
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict: def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
""" """
Execute a GraphQL query with operation-specific headers. Execute a GraphQL query with operation-specific headers.
@@ -59,11 +67,21 @@ class RealtorScraper(Scraper):
payload = { payload = {
"operationName": operation_name, # Include in payload "operationName": operation_name, # Include in payload
"query": query, "query": self._minify_query(query),
"variables": variables, "variables": variables,
} }
response = self.session.post(self.SEARCH_GQL_URL, json=payload) response = self.session.post(self.SEARCH_GQL_URL, data=json.dumps(payload, separators=(',', ':')))
if response.status_code == 403:
if not self.proxy:
raise AuthenticationError(
"Received 403 Forbidden from Realtor.com API.",
response=response
)
else:
raise Exception("Received 403 Forbidden, retrying...")
return response.json() return response.json()
@retry( @retry(
@@ -72,37 +90,13 @@ class RealtorScraper(Scraper):
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
def handle_location(self): def handle_location(self):
query = """
fragment SuggestionFragment on SearchSuggestionGeoResult {
type
text
geo {
_id
area_type
city
state_code
postal_code
county
centroid { lat lon }
slug_id
geo_id
}
}
query SearchSuggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
geo_results {
...SuggestionFragment
}
}
}"""
variables = { variables = {
"searchInput": { "searchInput": {
"search_term": self.location "search_term": self.location
} }
} }
response_json = self._graphql_post(query, variables, "SearchSuggestions") response_json = self._graphql_post(MORPHEUS_SUGGESTIONS_QUERY, variables, "GetMorpheusSuggestions")
if ( if (
response_json is None response_json is None
@@ -405,8 +399,7 @@ class RealtorScraper(Scraper):
is_foreclosure = "foreclosure: false" is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address if search_type == "comps": #: comps search, came from an address
query = """%s query = """query GetHomeSearch(
query GetHomeSearch(
$coordinates: [Float]! $coordinates: [Float]!
$radius: String! $radius: String!
$offset: Int!, $offset: Int!,
@@ -428,8 +421,9 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
}""" % ( }
SEARCH_RESULTS_FRAGMENT, %s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -438,10 +432,11 @@ class RealtorScraper(Scraper):
pending_or_contingent_param, pending_or_contingent_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """%s query = """query GetHomeSearch(
query GetHomeSearch(
$search_location: SearchLocation, $search_location: SearchLocation,
$offset: Int, $offset: Int,
) { ) {
@@ -460,8 +455,9 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
}""" % ( }
SEARCH_RESULTS_FRAGMENT, %s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -471,11 +467,12 @@ class RealtorScraper(Scraper):
bucket_param, bucket_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
else: #: general search, came from an address else: #: general search, came from an address
query = ( query = (
"""%s """query GetHomeSearch(
query GetHomeSearch(
$property_id: [ID]! $property_id: [ID]!
$offset: Int!, $offset: Int!,
) { ) {
@@ -486,8 +483,10 @@ class RealtorScraper(Scraper):
limit: 1 limit: 1
offset: $offset offset: $offset
) %s ) %s
}""" }
% (SEARCH_RESULTS_FRAGMENT, GENERAL_RESULTS_QUERY) %s
%s"""
% (GENERAL_RESULTS_QUERY, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT)
) )
response_json = self._graphql_post(query, variables, "GetHomeSearch") response_json = self._graphql_post(query, variables, "GetHomeSearch")

View File

@@ -121,13 +121,6 @@ fragment SearchFragment on SearchHome {
primary_photo(https: true) { primary_photo(https: true) {
href href
} }
photos(https: true) {
title
href
tags {
label
}
}
advertisers { advertisers {
email email
broker { broker {
@@ -699,5 +692,24 @@ GENERAL_RESULTS_QUERY = """{
results { results {
__typename __typename
...SearchFragment ...SearchFragment
...ListingPhotosFragment
} }
}""" }"""
# GraphQL fragment definition `ListingPhotosFragment` on SearchHome: selects
# the listing photos (title, href) and, per photo, its tags (label,
# probability), with __typename requested at every level.
# GENERAL_RESULTS_QUERY spreads `...ListingPhotosFragment`, so a query that
# embeds GENERAL_RESULTS_QUERY also needs this definition included in the
# same document.
LISTING_PHOTOS_FRAGMENT = """
fragment ListingPhotosFragment on SearchHome {
__typename
photos(https: true) {
__typename
title
href
tags {
__typename
label
probability
}
}
}
"""
MORPHEUS_SUGGESTIONS_QUERY = """query GetMorpheusSuggestions($searchInput: SearchSuggestionsInput!) { search_suggestions(search_input: $searchInput) { __typename geo_results { __typename type text geo { __typename _id _score mpr_id area_type city state_code postal_code country lat lon county counties { __typename name fips state_code } slug_id geo_id score name city_slug_id centroid { __typename lat lon } county_needed_for_uniq street line school school_id school_district school_district_id has_catchment university university_id neighborhood park } } no_matches has_results filter_criteria { __typename property_type { __typename type } price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } feature_tags { __typename tags } listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house virtual_tour three_d_tour contingent hide_contingent pending hide_pending } keyword { __typename keywords } garage { __typename min max pattern } age { __typename min max pattern } stories { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } home_size { __typename min max pattern } basement finished_basement pool waterfront fireplace detached_garage expand { __typename radius } hoa { __typename type fee } } message_data { __typename property_type pool waterfront fireplace basement finished_basement detached_garage listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house } keywords price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } garage { __typename min max pattern } stories { 
__typename min max pattern } age { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } } original_string morpheus_context } }"""

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.9" version = "0.8.13"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"