Compare commits

...

3 Commits

Author SHA1 Message Date
zacharyhampton
9a0cac650e Version bump to 0.8.16 2025-12-21 16:22:03 -07:00
zacharyhampton
a1c1bcc822 Version bump to 0.8.15
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 16:03:57 -07:00
zacharyhampton
6f3faceb27 Version bump to 0.8.14
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 14:32:59 -07:00
4 changed files with 128 additions and 28 deletions

View File

@@ -5,7 +5,6 @@ import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
import secrets
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json import json
@@ -83,15 +82,19 @@ class Scraper:
Scraper.session.headers.update( Scraper.session.headers.update(
{ {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'apollographql-client-version': '26.11.1-26.11.1.1106489',
'Accept': '*/*', 'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '26.11.1', 'Cache-Control': 'no-cache',
'X-APOLLO-OPERATION-TYPE': 'query', 'Pragma': 'no-cache',
'X-APOLLO-OPERATION-ID': secrets.token_hex(32), 'rdc-client-name': 'rdc-home',
'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor', 'rdc-client-version': '2.68.0',
'apollographql-client-name': 'com.move.Realtor-apollo-ios', 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'User-Agent': 'Realtor.com/26.11.1.1106489 CFNetwork/3860.200.71 Darwin/25.1.0', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
} }
) )

View File

@@ -17,6 +17,7 @@ from typing import Dict, Union
from tenacity import ( from tenacity import (
retry, retry,
retry_if_exception_type, retry_if_exception_type,
retry_if_not_exception_type,
wait_exponential, wait_exponential,
stop_after_attempt, stop_after_attempt,
) )
@@ -28,7 +29,7 @@ from ..models import (
ListingType, ListingType,
ReturnType ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT, MORPHEUS_SUGGESTIONS_QUERY from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT, SEARCH_SUGGESTIONS_QUERY
from .processors import ( from .processors import (
process_property, process_property,
process_extra_property_details, process_extra_property_details,
@@ -37,7 +38,7 @@ from .processors import (
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
SEARCH_GQL_URL = "https://api.frontdoor.realtor.com/graphql" SEARCH_GQL_URL = "https://www.realtor.com/frontdoor/graphql"
NUM_PROPERTY_WORKERS = 20 NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200 DEFAULT_PAGE_SIZE = 200
@@ -52,21 +53,18 @@ class RealtorScraper(Scraper):
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict: def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
""" """
Execute a GraphQL query with operation-specific headers. Execute a GraphQL query.
Args: Args:
query: GraphQL query string (must include operationName matching operation_name param) query: GraphQL query string (must include operationName matching operation_name param)
variables: Query variables dictionary variables: Query variables dictionary
operation_name: Name of the GraphQL operation for Apollo headers operation_name: Name of the GraphQL operation
Returns: Returns:
Response JSON dictionary Response JSON dictionary
""" """
# Set operation-specific header (must match query's operationName)
self.session.headers['X-APOLLO-OPERATION-NAME'] = operation_name
payload = { payload = {
"operationName": operation_name, # Include in payload "operationName": operation_name,
"query": self._minify_query(query), "query": self._minify_query(query),
"variables": variables, "variables": variables,
} }
@@ -96,7 +94,7 @@ class RealtorScraper(Scraper):
} }
} }
response_json = self._graphql_post(MORPHEUS_SUGGESTIONS_QUERY, variables, "GetMorpheusSuggestions") response_json = self._graphql_post(SEARCH_SUGGESTIONS_QUERY, variables, "Search_suggestions")
if ( if (
response_json is None response_json is None
@@ -128,6 +126,11 @@ class RealtorScraper(Scraper):
} }
if geo.get("area_type") == "address": if geo.get("area_type") == "address":
# Try to get mpr_id directly from API response first
if geo.get("mpr_id"):
result["mpr_id"] = geo.get("mpr_id")
else:
# Fallback: extract from _id field if it has addr: prefix
geo_id = geo.get("_id", "") geo_id = geo.get("_id", "")
if geo_id.startswith("addr:"): if geo_id.startswith("addr:"):
result["mpr_id"] = geo_id.replace("addr:", "") result["mpr_id"] = geo_id.replace("addr:", "")
@@ -171,7 +174,7 @@ class RealtorScraper(Scraper):
"""%s """%s
query GetHomeDetails($property_id: ID!) { query GetHomeDetails($property_id: ID!) {
home(property_id: $property_id) { home(property_id: $property_id) {
...HomeDetailsFragment ...SearchFragment
} }
}""" }"""
% HOME_FRAGMENT % HOME_FRAGMENT
@@ -1110,7 +1113,7 @@ class RealtorScraper(Scraper):
@retry( @retry(
retry=retry_if_exception_type((JSONDecodeError, Exception)), retry=retry_if_exception_type((JSONDecodeError, Exception)) & retry_if_not_exception_type(AuthenticationError),
wait=wait_exponential(multiplier=1, min=1, max=10), wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
@@ -1125,16 +1128,16 @@ class RealtorScraper(Scraper):
property_ids = list(set(property_ids)) property_ids = list(set(property_ids))
fragments = "\n".join( fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeDetailsFragment }}' f'home_{property_id}: home(property_id: {property_id}) {{ ...SearchFragment }}'
for property_id in property_ids for property_id in property_ids
) )
query = f"""{HOME_FRAGMENT} query = f"""{HOME_FRAGMENT}
query GetHomeDetails {{ query GetHome {{
{fragments} {fragments}
}}""" }}"""
data = self._graphql_post(query, {}, "GetHomeDetails") data = self._graphql_post(query, {}, "GetHome")
if "data" not in data or data["data"] is None: if "data" not in data or data["data"] is None:
# If we got a 400 error with "Required parameter is missing", raise to trigger retry # If we got a 400 error with "Required parameter is missing", raise to trigger retry

View File

@@ -371,7 +371,7 @@ _SEARCH_HOMES_DATA_BASE = """{
HOME_FRAGMENT = """ HOME_FRAGMENT = """
fragment HomeDetailsFragment on Home { fragment SearchFragment on Home {
__typename __typename
pending_date pending_date
listing_id listing_id
@@ -712,4 +712,98 @@ fragment ListingPhotosFragment on SearchHome {
} }
""" """
MORPHEUS_SUGGESTIONS_QUERY = """query GetMorpheusSuggestions($searchInput: SearchSuggestionsInput!) { search_suggestions(search_input: $searchInput) { __typename geo_results { __typename type text geo { __typename _id _score mpr_id area_type city state_code postal_code country lat lon county counties { __typename name fips state_code } slug_id geo_id score name city_slug_id centroid { __typename lat lon } county_needed_for_uniq street line school school_id school_district school_district_id has_catchment university university_id neighborhood park } } no_matches has_results filter_criteria { __typename property_type { __typename type } price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } feature_tags { __typename tags } listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house virtual_tour three_d_tour contingent hide_contingent pending hide_pending } keyword { __typename keywords } garage { __typename min max pattern } age { __typename min max pattern } stories { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } home_size { __typename min max pattern } basement finished_basement pool waterfront fireplace detached_garage expand { __typename radius } hoa { __typename type fee } } message_data { __typename property_type pool waterfront fireplace basement finished_basement detached_garage listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house } keywords price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } garage { __typename min max pattern } stories { __typename min max pattern } age { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } } original_string morpheus_context } }""" SEARCH_SUGGESTIONS_QUERY = """query Search_suggestions($searchInput: SearchSuggestionsInput!) {
search_suggestions(search_input: $searchInput) {
raw_input_parser_result
typeahead_results {
display_string
display_geo
geo {
_id
_score
mpr_id
area_type
city
state_code
state
postal_code
country
lat
lon
county
counties {
name
fips
state_code
}
slug_id
geo_id
score
name
city_slug_id
centroid {
lat
lon
}
county_needed_for_uniq
street
line
school
school_id
school_district
has_catchment
university
university_id
neighborhood
park
}
url
}
geo_results {
type
text
geo {
_id
_score
mpr_id
area_type
city
state_code
state
postal_code
country
lat
lon
county
counties {
name
fips
state_code
}
slug_id
geo_id
score
name
city_slug_id
centroid {
lat
lon
}
county_needed_for_uniq
street
line
school
school_id
school_district
has_catchment
university
university_id
neighborhood
park
}
}
no_matches
has_results
original_string
}
}"""

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.13" version = "0.8.16"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"