Fix sold listings not included when listing_type=None (issue #142)

When listing_type=None, sold listings were excluded despite the documentation stating that all types should be returned. This fix includes two changes: (1) explicitly include the common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending an empty status parameter; (2) fix the or_filters logic so it is applied only when PENDING is not mixed with other types such as SOLD, preventing unintended filtering. The README documentation was also updated to accurately reflect that None returns the common listing types rather than all 8 types. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
Add configurable parallel/sequential pagination with parallel parameter
2026-03-04 19:44:29 -08:00 · 2025-11-14 13:30:54 -08:00 · 2025-11-13 10:36:47 -08:00
5 changed files with 85 additions and 37 deletions
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ properties = scrape_property(
 #### Sorting & Listing Types
 ```py
 # Sort options: list_price, list_date, sqft, beds, baths, last_update_date
-# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all)
+# Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
 properties = scrape_property(
    location="Miami, FL",
    listing_type=["for_sale", "pending"],  # Single string, list, or None
@@ -94,6 +94,17 @@ properties = scrape_property(
 )
 ```

+#### Pagination Control
+```py
+# Sequential mode with early termination (more efficient for narrow filters)
+properties = scrape_property(
+    location="Los Angeles, CA",
+    listing_type="for_sale",
+    updated_in_past_hours=2,  # Narrow time window
+    parallel=False  # Fetch pages sequentially, stop when filters no longer match
+)
+```
+
 ## Output
 ```plaintext
 >>> properties.head()
@@ -147,7 +158,7 @@ Required
 │    - 'other'
 │    - 'ready_to_build'
 │    - List of strings returns properties matching ANY status: ['for_sale', 'pending']
-│    - None returns all listing types
+│    - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
 │
 Optional
 ├── property_type (list): Choose the type of properties.
@@ -234,7 +245,9 @@ Optional
 │
 ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
 │
-└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
+├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
+│
+└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows).
 ```

 ### Property Schema
--- a/homeharvest/init.py
+++ b/homeharvest/init.py
@@ -48,6 +48,8 @@ def scrape_property(
    # New sorting parameters
    sort_by: str = None,
    sort_direction: str = "desc",
+    # Pagination control
+    parallel: bool = True,
 ) -> Union[pd.DataFrame, list[dict], list[Property]]:
    """
    Scrape properties from Realtor.com based on a given location and listing type.
@@ -96,6 +98,9 @@ def scrape_property(
    :param year_built_min, year_built_max: Filter by year built
    :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
    :param sort_direction: Sort direction (asc, desc)
+    :param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed.
+        False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows).
+        Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible.

    Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
    """
@@ -190,6 +195,8 @@ def scrape_property(
        # New sorting
        sort_by=sort_by,
        sort_direction=sort_direction,
+        # Pagination control
+        parallel=parallel,
    )

    site = RealtorScraper(scraper_input)
--- a/homeharvest/core/scrapers/init.py
+++ b/homeharvest/core/scrapers/init.py
@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
    sort_by: str | None = None
    sort_direction: str = "desc"

+    # Pagination control
+    parallel: bool = True
+

 class Scraper:
    session = None
@@ -141,6 +144,9 @@ class Scraper:
        self.sort_by = scraper_input.sort_by
        self.sort_direction = scraper_input.sort_direction

+        # Pagination control
+        self.parallel = scraper_input.parallel
+
    def search(self) -> list[Union[Property | dict]]: ...

    @staticmethod
--- a/homeharvest/core/scrapers/realtor/init.py
+++ b/homeharvest/core/scrapers/realtor/init.py
@@ -144,7 +144,15 @@ class RealtorScraper(Scraper):
        # Determine date field based on listing type
        # Convert listing_type to list for uniform handling
        if self.listing_type is None:
-            listing_types = []
+            # When None, return all common listing types as documented
+            # Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
+            listing_types = [
+                ListingType.FOR_SALE,
+                ListingType.FOR_RENT,
+                ListingType.SOLD,
+                ListingType.PENDING,
+                ListingType.OFF_MARKET,
+            ]
            date_field = None  # When no listing_type is specified, skip date filtering
        elif isinstance(self.listing_type, list):
            listing_types = self.listing_type
@@ -277,10 +285,14 @@ class RealtorScraper(Scraper):
        else:
            sort_param = ""  #: prioritize normal fractal sort from realtor

-        # Handle PENDING with or_filters (applies if PENDING is in the list or is the single type)
+        # Handle PENDING with or_filters
+        # Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
+        # Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
        has_pending = ListingType.PENDING in listing_types
+        other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
+        use_or_filters = has_pending and len(other_types) == 0
        pending_or_contingent_param = (
-            "or_filters: { contingent: true, pending: true }" if has_pending else ""
+            "or_filters: { contingent: true, pending: true }" if use_or_filters else ""
        )

        # Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -526,39 +538,49 @@ class RealtorScraper(Scraper):
        total = result["total"]
        homes = result["properties"]

-        # Pre-check: Should we continue pagination?
-        # This optimization prevents unnecessary API calls when using time-based filters
-        # with date sorting. If page 1's last property is outside the time window,
-        # all future pages will also be outside (due to sort order).
-        should_continue_pagination = self._should_fetch_more_pages(homes)
+        # Fetch remaining pages based on parallel parameter
+        if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
+            if self.parallel:
+                # Parallel mode: Fetch all remaining pages in parallel
+                with ThreadPoolExecutor() as executor:
+                    futures_with_offsets = [
+                        (i, executor.submit(
+                            self.general_search,
+                            variables=search_variables | {"offset": i},
+                            search_type=search_type,
+                        ))
+                        for i in range(
+                            self.offset + self.DEFAULT_PAGE_SIZE,
+                            min(total, self.offset + self.limit),
+                            self.DEFAULT_PAGE_SIZE,
+                        )
+                    ]

-        # Only launch parallel pagination if needed
-        if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
-            with ThreadPoolExecutor() as executor:
-                # Store futures with their offsets to maintain proper sort order
-                # Start from offset + page_size and go up to offset + limit
-                futures_with_offsets = [
-                    (i, executor.submit(
-                        self.general_search,
-                        variables=search_variables | {"offset": i},
+                    # Collect results and sort by offset to preserve API sort order
+                    results = []
+                    for offset, future in futures_with_offsets:
+                        results.append((offset, future.result()["properties"]))
+
+                    results.sort(key=lambda x: x[0])
+                    for offset, properties in results:
+                        homes.extend(properties)
+            else:
+                # Sequential mode: Fetch pages one by one with early termination checks
+                for current_offset in range(
+                    self.offset + self.DEFAULT_PAGE_SIZE,
+                    min(total, self.offset + self.limit),
+                    self.DEFAULT_PAGE_SIZE,
+                ):
+                    # Check if we should continue based on time-based filters
+                    if not self._should_fetch_more_pages(homes):
+                        break
+
+                    result = self.general_search(
+                        variables=search_variables | {"offset": current_offset},
                        search_type=search_type,
-                    ))
-                    for i in range(
-                        self.offset + self.DEFAULT_PAGE_SIZE,
-                        min(total, self.offset + self.limit),
-                        self.DEFAULT_PAGE_SIZE,
                    )
-                ]
-
-                # Collect results and sort by offset to preserve API sort order across pages
-                results = []
-                for offset, future in futures_with_offsets:
-                    results.append((offset, future.result()["properties"]))
-
-                # Sort by offset and concatenate in correct order
-                results.sort(key=lambda x: x[0])
-                for offset, properties in results:
-                    homes.extend(properties)
+                    page_properties = result["properties"]
+                    homes.extend(page_properties)

        # Apply client-side hour-based filtering if needed
        # (API only supports day-level filtering, so we post-filter for hour precision)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.8.3"
+version = "0.8.5"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"