From c2f01df1ad9bbe7f0a5b653e2a334efbeb688725 Mon Sep 17 00:00:00 2001 From: Zachary Hampton Date: Thu, 13 Nov 2025 10:36:47 -0800 Subject: [PATCH] Add configurable parallel/sequential pagination with `parallel` parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `parallel: bool = True` parameter to control pagination strategy - Parallel mode (default): Fetches all pages in parallel for maximum speed - Sequential mode: Fetches pages one-by-one with early termination checks - Early termination stops pagination when time-based filters indicate no more matches - Useful for rate limiting and narrow time windows - Simplified pagination logic by removing hybrid first-page pre-check - Updated README with usage example and parameter documentation - Version bump to 0.8.4 - All 54 tests passing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- README.md | 15 +++- homeharvest/__init__.py | 7 ++ homeharvest/core/scrapers/__init__.py | 6 ++ homeharvest/core/scrapers/realtor/__init__.py | 70 +++++++++++-------- pyproject.toml | 2 +- 5 files changed, 68 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index f51fcd3..965f6bc 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,17 @@ properties = scrape_property( ) ``` +#### Pagination Control +```py +# Sequential mode with early termination (more efficient for narrow filters) +properties = scrape_property( + location="Los Angeles, CA", + listing_type="for_sale", + updated_in_past_hours=2, # Narrow time window + parallel=False # Fetch pages sequentially, stop when filters no longer match +) +``` + ## Output ```plaintext >>> properties.head() @@ -234,7 +245,9 @@ Optional │ ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000. │ -└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks. +├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks. +│ +└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows). ``` ### Property Schema diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 1436e10..1ce960b 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -48,6 +48,8 @@ def scrape_property( # New sorting parameters sort_by: str = None, sort_direction: str = "desc", + # Pagination control + parallel: bool = True, ) -> Union[pd.DataFrame, list[dict], list[Property]]: """ Scrape properties from Realtor.com based on a given location and listing type. @@ -96,6 +98,9 @@ def scrape_property( :param year_built_min, year_built_max: Filter by year built :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_direction: Sort direction (asc, desc) + :param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed. + False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows). + Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible. Note: past_days and past_hours also accept timedelta objects for more Pythonic usage. """ @@ -190,6 +195,8 @@ def scrape_property( # New sorting sort_by=sort_by, sort_direction=sort_direction, + # Pagination control + parallel=parallel, ) site = RealtorScraper(scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 8ad2051..c9b4f3b 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -55,6 +55,9 @@ class ScraperInput(BaseModel): sort_by: str | None = None sort_direction: str = "desc" + # Pagination control + parallel: bool = True + class Scraper: session = None @@ -141,6 +144,9 @@ class Scraper: self.sort_by = scraper_input.sort_by self.sort_direction = scraper_input.sort_direction + # Pagination control + self.parallel = scraper_input.parallel + def search(self) -> list[Union[Property | dict]]: ... @staticmethod diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index ea3a63b..f617965 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -526,39 +526,49 @@ class RealtorScraper(Scraper): total = result["total"] homes = result["properties"] - # Pre-check: Should we continue pagination? - # This optimization prevents unnecessary API calls when using time-based filters - # with date sorting. If page 1's last property is outside the time window, - # all future pages will also be outside (due to sort order). - should_continue_pagination = self._should_fetch_more_pages(homes) + # Fetch remaining pages based on parallel parameter + if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit): + if self.parallel: + # Parallel mode: Fetch all remaining pages in parallel + with ThreadPoolExecutor() as executor: + futures_with_offsets = [ + (i, executor.submit( + self.general_search, + variables=search_variables | {"offset": i}, + search_type=search_type, + )) + for i in range( + self.offset + self.DEFAULT_PAGE_SIZE, + min(total, self.offset + self.limit), + self.DEFAULT_PAGE_SIZE, + ) + ] - # Only launch parallel pagination if needed - if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit): - with ThreadPoolExecutor() as executor: - # Store futures with their offsets to maintain proper sort order - # Start from offset + page_size and go up to offset + limit - futures_with_offsets = [ - (i, executor.submit( - self.general_search, - variables=search_variables | {"offset": i}, + # Collect results and sort by offset to preserve API sort order + results = [] + for offset, future in futures_with_offsets: + results.append((offset, future.result()["properties"])) + + results.sort(key=lambda x: x[0]) + for offset, properties in results: + homes.extend(properties) + else: + # Sequential mode: Fetch pages one by one with early termination checks + for current_offset in range( + self.offset + self.DEFAULT_PAGE_SIZE, + min(total, self.offset + self.limit), + self.DEFAULT_PAGE_SIZE, + ): + # Check if we should continue based on time-based filters + if not self._should_fetch_more_pages(homes): + break + + result = self.general_search( + variables=search_variables | {"offset": current_offset}, search_type=search_type, - )) - for i in range( - self.offset + self.DEFAULT_PAGE_SIZE, - min(total, self.offset + self.limit), - self.DEFAULT_PAGE_SIZE, ) - ] - - # Collect results and sort by offset to preserve API sort order across pages - results = [] - for offset, future in futures_with_offsets: - results.append((offset, future.result()["properties"])) - - # Sort by offset and concatenate in correct order - results.sort(key=lambda x: x[0]) - for offset, properties in results: - homes.extend(properties) + page_properties = result["properties"] + homes.extend(page_properties) # Apply client-side hour-based filtering if needed # (API only supports day-level filtering, so we post-filter for hour precision) diff --git a/pyproject.toml b/pyproject.toml index 09e160f..3320385 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.8.3" +version = "0.8.4" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest"