Merge pull request #145 from ZacharyHampton/fix/realtor-403-error

Fix 403 error from Realtor.com API changes
- version bump
2026-03-05 03:54:29 -08:00 · 2025-12-04 23:10:32 -08:00 · 2025-12-04 23:08:37 -08:00 · 2025-12-04 21:08:01 -08:00 · 2025-12-04 18:56:10 -08:00 · 2025-11-14 13:38:48 -08:00
6 changed files with 206 additions and 108 deletions
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ properties = scrape_property(
 #### Sorting & Listing Types
 ```py
 # Sort options: list_price, list_date, sqft, beds, baths, last_update_date
-# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all)
+# Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
 properties = scrape_property(
    location="Miami, FL",
    listing_type=["for_sale", "pending"],  # Single string, list, or None
@@ -94,6 +94,17 @@ properties = scrape_property(
 )
 ```

+#### Pagination Control
+```py
+# Sequential mode with early termination (more efficient for narrow filters)
+properties = scrape_property(
+    location="Los Angeles, CA",
+    listing_type="for_sale",
+    updated_in_past_hours=2,  # Narrow time window
+    parallel=False  # Fetch pages sequentially, stop when filters no longer match
+)
+```
+
 ## Output
 ```plaintext
 >>> properties.head()
@@ -147,7 +158,7 @@ Required
 │    - 'other'
 │    - 'ready_to_build'
 │    - List of strings returns properties matching ANY status: ['for_sale', 'pending']
-│    - None returns all listing types
+│    - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
 │
 Optional
 ├── property_type (list): Choose the type of properties.
@@ -234,7 +245,9 @@ Optional
 │
 ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
 │
-└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
+├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
+│
+└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows).
 ```

 ### Property Schema
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -48,6 +48,8 @@ def scrape_property(
    # New sorting parameters
    sort_by: str = None,
    sort_direction: str = "desc",
+    # Pagination control
+    parallel: bool = True,
 ) -> Union[pd.DataFrame, list[dict], list[Property]]:
    """
    Scrape properties from Realtor.com based on a given location and listing type.
@@ -96,6 +98,9 @@ def scrape_property(
    :param year_built_min, year_built_max: Filter by year built
    :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
    :param sort_direction: Sort direction (asc, desc)
+    :param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed.
+        False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows).
+        Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible.

    Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
    """
@@ -190,6 +195,8 @@ def scrape_property(
        # New sorting
        sort_by=sort_by,
        sort_direction=sort_direction,
+        # Pagination control
+        parallel=parallel,
    )

    site = RealtorScraper(scraper_input)
--- a/homeharvest/core/scrapers/__init__.py
+++ b/homeharvest/core/scrapers/__init__.py
@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
    sort_by: str | None = None
    sort_direction: str = "desc"

+    # Pagination control
+    parallel: bool = True
+

 class Scraper:
    session = None
@@ -78,21 +81,21 @@ class Scraper:
            Scraper.session.mount("https://", adapter)
            Scraper.session.headers.update(
                {
-                    "accept": "application/json, text/javascript",
-                    "accept-language": "en-US,en;q=0.9",
-                    "cache-control": "no-cache",
-                    "content-type": "application/json",
-                    "origin": "https://www.realtor.com",
-                    "pragma": "no-cache",
-                    "priority": "u=1, i",
-                    "rdc-ab-tests": "commute_travel_time_variation:v1",
-                    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
-                    "sec-ch-ua-mobile": "?0",
-                    "sec-ch-ua-platform": '"Windows"',
-                    "sec-fetch-dest": "empty",
-                    "sec-fetch-mode": "cors",
-                    "sec-fetch-site": "same-origin",
-                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+                    'sec-ch-ua-platform': '"macOS"',
+                    'rdc-client-name': 'rdc-search-for-sale-desktop',
+                    'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
+                    'sec-ch-ua-mobile': '?0',
+                    'rdc-client-version': '0.1.0',
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
+                    'accept': 'application/json',
+                    'content-type': 'application/json',
+                    'origin': 'https://www.realtor.com',
+                    'sec-fetch-site': 'same-site',
+                    'sec-fetch-mode': 'cors',
+                    'sec-fetch-dest': 'empty',
+                    'referer': 'https://www.realtor.com/',
+                    'accept-language': 'en-US,en;q=0.9',
+                    'priority': 'u=1, i',
                }
            )

@@ -141,6 +144,9 @@ class Scraper:
        self.sort_by = scraper_input.sort_by
        self.sort_direction = scraper_input.sort_direction

+        # Pagination control
+        self.parallel = scraper_input.parallel
+
    def search(self) -> list[Union[Property | dict]]: ...

    @staticmethod
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -35,10 +35,7 @@ from .processors import (


 class RealtorScraper(Scraper):
-    SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
-    PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
-    PROPERTY_GQL = "https://graph.realtor.com/graphql"
-    ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
+    SEARCH_GQL_URL = "https://api.frontdoor.realtor.com/graphql"
    NUM_PROPERTY_WORKERS = 20
    DEFAULT_PAGE_SIZE = 200

@@ -46,33 +43,70 @@ class RealtorScraper(Scraper):
        super().__init__(scraper_input)

    def handle_location(self):
-        # Get client_id from listing_type
-        if self.listing_type is None:
-            client_id = "for-sale"
-        elif isinstance(self.listing_type, list):
-            client_id = self.listing_type[0].value.lower().replace("_", "-") if self.listing_type else "for-sale"
-        else:
-            client_id = self.listing_type.value.lower().replace("_", "-")
+        query = """query Search_suggestions($searchInput: SearchSuggestionsInput!) {
+            search_suggestions(search_input: $searchInput) {
+                geo_results {
+                    type
+                    text
+                    geo {
+                        _id
+                        area_type
+                        city
+                        state_code
+                        postal_code
+                        county
+                        centroid { lat lon }
+                        slug_id
+                        geo_id
+                    }
+                }
+            }
+        }"""

-        params = {
-            "input": self.location,
-            "client_id": client_id,
-            "limit": "1",
-            "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
+        variables = {
+            "searchInput": {
+                "search_term": self.location
+            }
        }

-        response = self.session.get(
-            self.ADDRESS_AUTOCOMPLETE_URL,
-            params=params,
-        )
+        payload = {
+            "query": query,
+            "variables": variables,
+        }
+
+        response = self.session.post(self.SEARCH_GQL_URL, json=payload)
        response_json = response.json()

-        result = response_json["autocomplete"]
-
-        if not result:
+        if (
+            response_json is None
+            or "data" not in response_json
+            or response_json["data"] is None
+            or "search_suggestions" not in response_json["data"]
+            or response_json["data"]["search_suggestions"] is None
+            or "geo_results" not in response_json["data"]["search_suggestions"]
+            or not response_json["data"]["search_suggestions"]["geo_results"]
+        ):
            return None

-        return result[0]
+        geo_result = response_json["data"]["search_suggestions"]["geo_results"][0]
+        geo = geo_result.get("geo", {})
+
+        result = {
+            "text": geo_result.get("text"),
+            "area_type": geo.get("area_type"),
+            "city": geo.get("city"),
+            "state_code": geo.get("state_code"),
+            "postal_code": geo.get("postal_code"),
+            "county": geo.get("county"),
+            "centroid": geo.get("centroid"),
+        }
+
+        if geo.get("area_type") == "address":
+            geo_id = geo.get("_id", "")
+            if geo_id.startswith("addr:"):
+                result["mpr_id"] = geo_id.replace("addr:", "")
+
+        return result

    def get_latest_listing_id(self, property_id: str) -> str | None:
        query = """query Property($property_id: ID!) {
@@ -108,6 +142,7 @@ class RealtorScraper(Scraper):
            return property_info["listings"][0]["listing_id"]

    def handle_home(self, property_id: str) -> list[Property]:
+        """Fetch single home with proper error handling."""
        query = (
            """query Home($property_id: ID!) {
                    home(property_id: $property_id) %s
@@ -116,23 +151,33 @@ class RealtorScraper(Scraper):
        )

        variables = {"property_id": property_id}
-        payload = {
-            "query": query,
-            "variables": variables,
-        }
+        payload = {"query": query, "variables": variables}

-        response = self.session.post(self.SEARCH_GQL_URL, json=payload)
-        response_json = response.json()
+        try:
+            response = self.session.post(self.SEARCH_GQL_URL, json=payload)
+            data = response.json()

-        property_info = response_json["data"]["home"]
+            # Check for errors or missing data
+            if "errors" in data or "data" not in data:
+                return []

-        if self.return_type != ReturnType.raw:
-            return [process_property(property_info, self.mls_only, self.extra_property_data, 
-                                   self.exclude_pending, self.listing_type, get_key, process_extra_property_details)]
-        else:
-            return [property_info]
+            if data["data"] is None or "home" not in data["data"]:
+                return []

+            property_info = data["data"]["home"]
+            if property_info is None:
+                return []

+            # Process based on return type
+            if self.return_type != ReturnType.raw:
+                return [process_property(property_info, self.mls_only, self.extra_property_data,
+                                       self.exclude_pending, self.listing_type, get_key,
+                                       process_extra_property_details)]
+            else:
+                return [property_info]
+
+        except Exception:
+            return []

    def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
        """
@@ -144,7 +189,15 @@ class RealtorScraper(Scraper):
        # Determine date field based on listing type
        # Convert listing_type to list for uniform handling
        if self.listing_type is None:
-            listing_types = []
+            # When None, return all common listing types as documented
+            # Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
+            listing_types = [
+                ListingType.FOR_SALE,
+                ListingType.FOR_RENT,
+                ListingType.SOLD,
+                ListingType.PENDING,
+                ListingType.OFF_MARKET,
+            ]
            date_field = None  # When no listing_type is specified, skip date filtering
        elif isinstance(self.listing_type, list):
            listing_types = self.listing_type
@@ -277,10 +330,14 @@ class RealtorScraper(Scraper):
        else:
            sort_param = ""  #: prioritize normal fractal sort from realtor

-        # Handle PENDING with or_filters (applies if PENDING is in the list or is the single type)
+        # Handle PENDING with or_filters
+        # Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
+        # Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
        has_pending = ListingType.PENDING in listing_types
+        other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
+        use_or_filters = has_pending and len(other_types) == 0
        pending_or_contingent_param = (
-            "or_filters: { contingent: true, pending: true }" if has_pending else ""
+            "or_filters: { contingent: true, pending: true }" if use_or_filters else ""
        )

        # Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -351,19 +408,13 @@ class RealtorScraper(Scraper):
            )
        elif search_type == "area":  #: general search, came from a general location
            query = """query Home_search(
-                                $city: String,
-                                $county: [String],
-                                $state_code: String,
-                                $postal_code: String
+                                $search_location: SearchLocation,
                                $offset: Int,
                            ) {
                                home_search(
                                    query: {
                                        %s
-                                        city: $city
-                                        county: $county
-                                        postal_code: $postal_code
-                                        state_code: $state_code
+                                        search_location: $search_location
                                        %s
                                        %s
                                        %s
@@ -499,24 +550,16 @@ class RealtorScraper(Scraper):
                if not location_info.get("centroid"):
                    return []

-                coordinates = list(location_info["centroid"].values())
+                centroid = location_info["centroid"]
+                coordinates = [centroid["lon"], centroid["lat"]]  # GeoJSON order: [lon, lat]
                search_variables |= {
                    "coordinates": coordinates,
                    "radius": "{}mi".format(self.radius),
                }

-        elif location_type == "postal_code":
+        else:  #: general search (city, county, postal_code, etc.)
            search_variables |= {
-                "postal_code": location_info.get("postal_code"),
-            }
-
-        else:  #: general search, location
-            search_variables |= {
-                "city": location_info.get("city"),
-                "county": location_info.get("county"),
-                "state_code": location_info.get("state_code"),
-                "postal_code": location_info.get("postal_code"),
-
+                "search_location": {"location": location_info.get("text")},
            }

        if self.foreclosure:
@@ -526,39 +569,49 @@ class RealtorScraper(Scraper):
        total = result["total"]
        homes = result["properties"]

-        # Pre-check: Should we continue pagination?
-        # This optimization prevents unnecessary API calls when using time-based filters
-        # with date sorting. If page 1's last property is outside the time window,
-        # all future pages will also be outside (due to sort order).
-        should_continue_pagination = self._should_fetch_more_pages(homes)
+        # Fetch remaining pages based on parallel parameter
+        if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
+            if self.parallel:
+                # Parallel mode: Fetch all remaining pages in parallel
+                with ThreadPoolExecutor() as executor:
+                    futures_with_offsets = [
+                        (i, executor.submit(
+                            self.general_search,
+                            variables=search_variables | {"offset": i},
+                            search_type=search_type,
+                        ))
+                        for i in range(
+                            self.offset + self.DEFAULT_PAGE_SIZE,
+                            min(total, self.offset + self.limit),
+                            self.DEFAULT_PAGE_SIZE,
+                        )
+                    ]

-        # Only launch parallel pagination if needed
-        if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
-            with ThreadPoolExecutor() as executor:
-                # Store futures with their offsets to maintain proper sort order
-                # Start from offset + page_size and go up to offset + limit
-                futures_with_offsets = [
-                    (i, executor.submit(
-                        self.general_search,
-                        variables=search_variables | {"offset": i},
+                    # Collect results and sort by offset to preserve API sort order
+                    results = []
+                    for offset, future in futures_with_offsets:
+                        results.append((offset, future.result()["properties"]))
+
+                    results.sort(key=lambda x: x[0])
+                    for offset, properties in results:
+                        homes.extend(properties)
+            else:
+                # Sequential mode: Fetch pages one by one with early termination checks
+                for current_offset in range(
+                    self.offset + self.DEFAULT_PAGE_SIZE,
+                    min(total, self.offset + self.limit),
+                    self.DEFAULT_PAGE_SIZE,
+                ):
+                    # Check if we should continue based on time-based filters
+                    if not self._should_fetch_more_pages(homes):
+                        break
+
+                    result = self.general_search(
+                        variables=search_variables | {"offset": current_offset},
                        search_type=search_type,
-                    ))
-                    for i in range(
-                        self.offset + self.DEFAULT_PAGE_SIZE,
-                        min(total, self.offset + self.limit),
-                        self.DEFAULT_PAGE_SIZE,
                    )
-                ]
-
-                # Collect results and sort by offset to preserve API sort order across pages
-                results = []
-                for offset, future in futures_with_offsets:
-                    results.append((offset, future.result()["properties"]))
-
-                # Sort by offset and concatenate in correct order
-                results.sort(key=lambda x: x[0])
-                for offset, properties in results:
-                    homes.extend(properties)
+                    page_properties = result["properties"]
+                    homes.extend(page_properties)

        # Apply client-side hour-based filtering if needed
        # (API only supports day-level filtering, so we post-filter for hour precision)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.8.3"
+version = "0.8.6b"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"
--- a/tests/test_realtor.py
+++ b/tests/test_realtor.py
@@ -87,6 +87,25 @@ def test_realtor_date_range_sold():
    )


+def test_listing_type_none_includes_sold():
+    """Test that listing_type=None includes sold listings (issue #142)"""
+    # Get properties with listing_type=None (should include all common types)
+    result_none = scrape_property(
+        location="Warren, MI",
+        listing_type=None
+    )
+
+    # Verify we got results
+    assert result_none is not None and len(result_none) > 0
+
+    # Verify sold listings are included
+    status_types = set(result_none['status'].unique())
+    assert 'SOLD' in status_types, "SOLD listings should be included when listing_type=None"
+
+    # Verify we get multiple listing types (not just one)
+    assert len(status_types) > 1, "Should return multiple listing types when listing_type=None"
+
+
 def test_realtor_single_property():
    results = [
        scrape_property(
Author	SHA1	Message	Date
Zachary Hampton	57093f5d17	Merge pull request #145 from ZacharyHampton/fix/realtor-403-error Fix 403 error from Realtor.com API changes	2025-12-04 23:10:32 -08:00
zacharyhampton	406ff97260	- version bump	2025-12-04 23:08:37 -08:00
zacharyhampton	a8c9d0fd66	Replace REST autocomplete with GraphQL Search_suggestions query - Replace /suggest REST endpoint with GraphQL Search_suggestions query - Use search_location field instead of individual city/county/state/postal_code fields - Fix coordinate order to [lon, lat] (GeoJSON standard) for radius searches - Extract mpr_id from addr: prefix for single address lookups 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-12-04 21:08:01 -08:00
Zachary Hampton	0b283e18bd	Fix 403 error from Realtor.com API changes - Update GraphQL endpoint to api.frontdoor.realtor.com - Update HTTP headers with newer Chrome version and correct client name/version - Improve error handling in handle_home method - Fix response validation for missing/null data 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-12-04 18:56:10 -08:00
Zachary Hampton	8bf1f9e24b	Add regression test for listing_type=None including sold listings Adds test_listing_type_none_includes_sold() to verify that when listing_type=None, sold listings are included in the results. This prevents regression of issue #142. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-11-14 13:38:48 -08:00
Zachary Hampton	79b2b648f5	Fix sold listings not included when listing_type=None (issue #142) When listing_type=None, sold listings were excluded despite documentation stating all types should be returned. This fix includes two changes: 1. Explicitly include common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending empty status parameter 2. Fix or_filters logic to only apply for PENDING when not mixed with other types like SOLD, preventing unintended filtering Updated README documentation to accurately reflect that None returns common listing types rather than all 8 types. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-11-14 13:30:54 -08:00
Zachary Hampton	c2f01df1ad	Add configurable parallel/sequential pagination with `parallel` parameter - Add `parallel: bool = True` parameter to control pagination strategy - Parallel mode (default): Fetches all pages in parallel for maximum speed - Sequential mode: Fetches pages one-by-one with early termination checks - Early termination stops pagination when time-based filters indicate no more matches - Useful for rate limiting and narrow time windows - Simplified pagination logic by removing hybrid first-page pre-check - Updated README with usage example and parameter documentation - Version bump to 0.8.4 - All 54 tests passing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-11-13 10:36:47 -08:00