Compare commits

...

2 Commits

Author SHA1 Message Date
Zachary Hampton
79b2b648f5 Fix sold listings not included when listing_type=None (issue #142)
When listing_type=None, sold listings were excluded despite documentation stating all types should be returned. This fix includes two changes:

1. Explicitly include common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending empty status parameter
2. Fix or_filters logic to only apply for PENDING when not mixed with other types like SOLD, preventing unintended filtering

Updated README documentation to accurately reflect that None returns common listing types rather than all 8 types.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:30:54 -08:00
Zachary Hampton
c2f01df1ad Add configurable parallel/sequential pagination with parallel parameter
- Add `parallel: bool = True` parameter to control pagination strategy
- Parallel mode (default): Fetches all pages in parallel for maximum speed
- Sequential mode: Fetches pages one-by-one with early termination checks
- Early termination stops pagination when time-based filters indicate no more matches
- Useful for rate limiting and narrow time windows
- Simplified pagination logic by removing hybrid first-page pre-check
- Updated README with usage example and parameter documentation
- Version bump to 0.8.4
- All 54 tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 10:36:47 -08:00
5 changed files with 85 additions and 37 deletions

View File

@@ -84,7 +84,7 @@ properties = scrape_property(
#### Sorting & Listing Types #### Sorting & Listing Types
```py ```py
# Sort options: list_price, list_date, sqft, beds, baths, last_update_date # Sort options: list_price, list_date, sqft, beds, baths, last_update_date
# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all) # Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
properties = scrape_property( properties = scrape_property(
location="Miami, FL", location="Miami, FL",
listing_type=["for_sale", "pending"], # Single string, list, or None listing_type=["for_sale", "pending"], # Single string, list, or None
@@ -94,6 +94,17 @@ properties = scrape_property(
) )
``` ```
#### Pagination Control
```py
# Sequential mode with early termination (more efficient for narrow filters)
properties = scrape_property(
location="Los Angeles, CA",
listing_type="for_sale",
updated_in_past_hours=2, # Narrow time window
parallel=False # Fetch pages sequentially, stop when filters no longer match
)
```
## Output ## Output
```plaintext ```plaintext
>>> properties.head() >>> properties.head()
@@ -147,7 +158,7 @@ Required
│ - 'other' │ - 'other'
│ - 'ready_to_build' │ - 'ready_to_build'
│ - List of strings returns properties matching ANY status: ['for_sale', 'pending'] │ - List of strings returns properties matching ANY status: ['for_sale', 'pending']
│ - None returns all listing types │ - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
Optional Optional
├── property_type (list): Choose the type of properties. ├── property_type (list): Choose the type of properties.
@@ -234,7 +245,9 @@ Optional
├── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks. ├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows).
``` ```
### Property Schema ### Property Schema

View File

@@ -48,6 +48,8 @@ def scrape_property(
# New sorting parameters # New sorting parameters
sort_by: str = None, sort_by: str = None,
sort_direction: str = "desc", sort_direction: str = "desc",
# Pagination control
parallel: bool = True,
) -> Union[pd.DataFrame, list[dict], list[Property]]: ) -> Union[pd.DataFrame, list[dict], list[Property]]:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -96,6 +98,9 @@ def scrape_property(
:param year_built_min, year_built_max: Filter by year built :param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
:param sort_direction: Sort direction (asc, desc) :param sort_direction: Sort direction (asc, desc)
:param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed.
False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows).
Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible.
Note: past_days and past_hours also accept timedelta objects for more Pythonic usage. Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
""" """
@@ -190,6 +195,8 @@ def scrape_property(
# New sorting # New sorting
sort_by=sort_by, sort_by=sort_by,
sort_direction=sort_direction, sort_direction=sort_direction,
# Pagination control
parallel=parallel,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)

View File

@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
sort_by: str | None = None sort_by: str | None = None
sort_direction: str = "desc" sort_direction: str = "desc"
# Pagination control
parallel: bool = True
class Scraper: class Scraper:
session = None session = None
@@ -141,6 +144,9 @@ class Scraper:
self.sort_by = scraper_input.sort_by self.sort_by = scraper_input.sort_by
self.sort_direction = scraper_input.sort_direction self.sort_direction = scraper_input.sort_direction
# Pagination control
self.parallel = scraper_input.parallel
def search(self) -> list[Union[Property | dict]]: ... def search(self) -> list[Union[Property | dict]]: ...
@staticmethod @staticmethod

View File

@@ -144,7 +144,15 @@ class RealtorScraper(Scraper):
# Determine date field based on listing type # Determine date field based on listing type
# Convert listing_type to list for uniform handling # Convert listing_type to list for uniform handling
if self.listing_type is None: if self.listing_type is None:
listing_types = [] # When None, return all common listing types as documented
# Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
listing_types = [
ListingType.FOR_SALE,
ListingType.FOR_RENT,
ListingType.SOLD,
ListingType.PENDING,
ListingType.OFF_MARKET,
]
date_field = None # When no listing_type is specified, skip date filtering date_field = None # When no listing_type is specified, skip date filtering
elif isinstance(self.listing_type, list): elif isinstance(self.listing_type, list):
listing_types = self.listing_type listing_types = self.listing_type
@@ -277,10 +285,14 @@ class RealtorScraper(Scraper):
else: else:
sort_param = "" #: prioritize normal fractal sort from realtor sort_param = "" #: prioritize normal fractal sort from realtor
# Handle PENDING with or_filters (applies if PENDING is in the list or is the single type) # Handle PENDING with or_filters
# Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
# Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
has_pending = ListingType.PENDING in listing_types has_pending = ListingType.PENDING in listing_types
other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
use_or_filters = has_pending and len(other_types) == 0
pending_or_contingent_param = ( pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" if has_pending else "" "or_filters: { contingent: true, pending: true }" if use_or_filters else ""
) )
# Build bucket parameter (only use fractal sort if no custom sort is specified) # Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -526,39 +538,49 @@ class RealtorScraper(Scraper):
total = result["total"] total = result["total"]
homes = result["properties"] homes = result["properties"]
# Pre-check: Should we continue pagination? # Fetch remaining pages based on parallel parameter
# This optimization prevents unnecessary API calls when using time-based filters if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
# with date sorting. If page 1's last property is outside the time window, if self.parallel:
# all future pages will also be outside (due to sort order). # Parallel mode: Fetch all remaining pages in parallel
should_continue_pagination = self._should_fetch_more_pages(homes) with ThreadPoolExecutor() as executor:
futures_with_offsets = [
(i, executor.submit(
self.general_search,
variables=search_variables | {"offset": i},
search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
)
]
# Only launch parallel pagination if needed # Collect results and sort by offset to preserve API sort order
if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit): results = []
with ThreadPoolExecutor() as executor: for offset, future in futures_with_offsets:
# Store futures with their offsets to maintain proper sort order results.append((offset, future.result()["properties"]))
# Start from offset + page_size and go up to offset + limit
futures_with_offsets = [ results.sort(key=lambda x: x[0])
(i, executor.submit( for offset, properties in results:
self.general_search, homes.extend(properties)
variables=search_variables | {"offset": i}, else:
# Sequential mode: Fetch pages one by one with early termination checks
for current_offset in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
):
# Check if we should continue based on time-based filters
if not self._should_fetch_more_pages(homes):
break
result = self.general_search(
variables=search_variables | {"offset": current_offset},
search_type=search_type, search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
) )
] page_properties = result["properties"]
homes.extend(page_properties)
# Collect results and sort by offset to preserve API sort order across pages
results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
# Apply client-side hour-based filtering if needed # Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision) # (API only supports day-level filtering, so we post-filter for hour precision)

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.3" version = "0.8.5"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"