Add configurable parallel/sequential pagination with parallel parameter

- Add `parallel: bool = True` parameter to control pagination strategy
- Parallel mode (default): Fetches all pages in parallel for maximum speed
- Sequential mode: Fetches pages one-by-one with early termination checks
- Early termination stops pagination as soon as time-based filters indicate no further matches are possible
- Sequential mode is useful for rate limiting and narrow time windows
- Simplified pagination logic by removing hybrid first-page pre-check
- Updated README with usage example and parameter documentation
- Version bump to 0.8.4
- All 54 tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Zachary Hampton
2025-11-13 10:36:47 -08:00
parent 9b61a89c77
commit c2f01df1ad
5 changed files with 68 additions and 32 deletions

View File

@@ -94,6 +94,17 @@ properties = scrape_property(
) )
``` ```
#### Pagination Control
```py
# Sequential mode with early termination (more efficient for narrow filters)
properties = scrape_property(
location="Los Angeles, CA",
listing_type="for_sale",
updated_in_past_hours=2, # Narrow time window
parallel=False # Fetch pages sequentially, stop when filters no longer match
)
```
## Output ## Output
```plaintext ```plaintext
>>> properties.head() >>> properties.head()
@@ -234,7 +245,9 @@ Optional
├── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks. ├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows).
``` ```
### Property Schema ### Property Schema

View File

@@ -48,6 +48,8 @@ def scrape_property(
# New sorting parameters # New sorting parameters
sort_by: str = None, sort_by: str = None,
sort_direction: str = "desc", sort_direction: str = "desc",
# Pagination control
parallel: bool = True,
) -> Union[pd.DataFrame, list[dict], list[Property]]: ) -> Union[pd.DataFrame, list[dict], list[Property]]:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -96,6 +98,9 @@ def scrape_property(
:param year_built_min, year_built_max: Filter by year built :param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
:param sort_direction: Sort direction (asc, desc) :param sort_direction: Sort direction (asc, desc)
:param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed.
False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows).
Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible.
Note: past_days and past_hours also accept timedelta objects for more Pythonic usage. Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
""" """
@@ -190,6 +195,8 @@ def scrape_property(
# New sorting # New sorting
sort_by=sort_by, sort_by=sort_by,
sort_direction=sort_direction, sort_direction=sort_direction,
# Pagination control
parallel=parallel,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)

View File

@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
sort_by: str | None = None sort_by: str | None = None
sort_direction: str = "desc" sort_direction: str = "desc"
# Pagination control
parallel: bool = True
class Scraper: class Scraper:
session = None session = None
@@ -141,6 +144,9 @@ class Scraper:
self.sort_by = scraper_input.sort_by self.sort_by = scraper_input.sort_by
self.sort_direction = scraper_input.sort_direction self.sort_direction = scraper_input.sort_direction
# Pagination control
self.parallel = scraper_input.parallel
def search(self) -> list[Union[Property | dict]]: ... def search(self) -> list[Union[Property | dict]]: ...
@staticmethod @staticmethod

View File

@@ -526,17 +526,11 @@ class RealtorScraper(Scraper):
total = result["total"] total = result["total"]
homes = result["properties"] homes = result["properties"]
# Pre-check: Should we continue pagination? # Fetch remaining pages based on parallel parameter
# This optimization prevents unnecessary API calls when using time-based filters if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
# with date sorting. If page 1's last property is outside the time window, if self.parallel:
# all future pages will also be outside (due to sort order). # Parallel mode: Fetch all remaining pages in parallel
should_continue_pagination = self._should_fetch_more_pages(homes)
# Only launch parallel pagination if needed
if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
# Store futures with their offsets to maintain proper sort order
# Start from offset + page_size and go up to offset + limit
futures_with_offsets = [ futures_with_offsets = [
(i, executor.submit( (i, executor.submit(
self.general_search, self.general_search,
@@ -550,15 +544,31 @@ class RealtorScraper(Scraper):
) )
] ]
# Collect results and sort by offset to preserve API sort order across pages # Collect results and sort by offset to preserve API sort order
results = [] results = []
for offset, future in futures_with_offsets: for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"])) results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0]) results.sort(key=lambda x: x[0])
for offset, properties in results: for offset, properties in results:
homes.extend(properties) homes.extend(properties)
else:
# Sequential mode: Fetch pages one by one with early termination checks
for current_offset in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
):
# Check if we should continue based on time-based filters
if not self._should_fetch_more_pages(homes):
break
result = self.general_search(
variables=search_variables | {"offset": current_offset},
search_type=search_type,
)
page_properties = result["properties"]
homes.extend(page_properties)
# Apply client-side hour-based filtering if needed # Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision) # (API only supports day-level filtering, so we post-filter for hour precision)

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.3" version = "0.8.4"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"