Compare commits

...

2 Commits

Author SHA1 Message Date
Zachary Hampton
79b2b648f5 Fix sold listings not included when listing_type=None (issue #142)
When listing_type=None, sold listings were excluded despite documentation stating all types should be returned. This fix includes two changes:

1. Explicitly include common listing types (for_sale, for_rent, sold, pending, off_market) when listing_type=None instead of sending empty status parameter
2. Fix or_filters logic to only apply for PENDING when not mixed with other types like SOLD, preventing unintended filtering

Updated README documentation to accurately reflect that None returns common listing types rather than all 8 types.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 13:30:54 -08:00
Zachary Hampton
c2f01df1ad Add configurable parallel/sequential pagination with parallel parameter
- Add `parallel: bool = True` parameter to control pagination strategy
- Parallel mode (default): Fetches all pages in parallel for maximum speed
- Sequential mode: Fetches pages one-by-one with early termination checks
- Early termination stops pagination when time-based filters indicate no more matches
- Useful for rate limiting and narrow time windows
- Simplified pagination logic by removing hybrid first-page pre-check
- Updated README with usage example and parameter documentation
- Version bump to 0.8.4
- All 54 tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 10:36:47 -08:00
5 changed files with 85 additions and 37 deletions

View File

@@ -84,7 +84,7 @@ properties = scrape_property(
#### Sorting & Listing Types #### Sorting & Listing Types
```py ```py
# Sort options: list_price, list_date, sqft, beds, baths, last_update_date # Sort options: list_price, list_date, sqft, beds, baths, last_update_date
# Listing types: "for_sale", "for_rent", "sold", "pending", list, or None (all) # Listing types: "for_sale", "for_rent", "sold", "pending", "off_market", list, or None (common types)
properties = scrape_property( properties = scrape_property(
location="Miami, FL", location="Miami, FL",
listing_type=["for_sale", "pending"], # Single string, list, or None listing_type=["for_sale", "pending"], # Single string, list, or None
@@ -94,6 +94,17 @@ properties = scrape_property(
) )
``` ```
#### Pagination Control
```py
# Sequential mode with early termination (more efficient for narrow filters)
properties = scrape_property(
location="Los Angeles, CA",
listing_type="for_sale",
updated_in_past_hours=2, # Narrow time window
parallel=False # Fetch pages sequentially, stop when filters no longer match
)
```
## Output ## Output
```plaintext ```plaintext
>>> properties.head() >>> properties.head()
@@ -147,7 +158,7 @@ Required
│ - 'other' │ - 'other'
│ - 'ready_to_build' │ - 'ready_to_build'
│ - List of strings returns properties matching ANY status: ['for_sale', 'pending'] │ - List of strings returns properties matching ANY status: ['for_sale', 'pending']
│ - None returns all listing types │ - None returns common listing types (for_sale, for_rent, sold, pending, off_market)
Optional Optional
├── property_type (list): Choose the type of properties. ├── property_type (list): Choose the type of properties.
@@ -234,7 +245,9 @@ Optional
├── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ├── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
└── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks. ├── offset (integer): Starting position for pagination within the 10k limit. Use with limit to fetch results in chunks.
└── parallel (True/False): Controls pagination strategy. Default is True (fetch pages in parallel for speed). Set to False for sequential fetching with early termination (useful for rate limiting or narrow time windows).
``` ```
### Property Schema ### Property Schema

View File

@@ -48,6 +48,8 @@ def scrape_property(
# New sorting parameters # New sorting parameters
sort_by: str = None, sort_by: str = None,
sort_direction: str = "desc", sort_direction: str = "desc",
# Pagination control
parallel: bool = True,
) -> Union[pd.DataFrame, list[dict], list[Property]]: ) -> Union[pd.DataFrame, list[dict], list[Property]]:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -96,6 +98,9 @@ def scrape_property(
:param year_built_min, year_built_max: Filter by year built :param year_built_min, year_built_max: Filter by year built
:param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date) :param sort_by: Sort results by field (list_date, sold_date, list_price, sqft, beds, baths, last_update_date)
:param sort_direction: Sort direction (asc, desc) :param sort_direction: Sort direction (asc, desc)
:param parallel: Controls pagination strategy. True (default) = fetch all pages in parallel for maximum speed.
False = fetch pages sequentially with early termination checks (useful for rate limiting or narrow time windows).
Sequential mode will stop paginating as soon as time-based filters indicate no more matches are possible.
Note: past_days and past_hours also accept timedelta objects for more Pythonic usage. Note: past_days and past_hours also accept timedelta objects for more Pythonic usage.
""" """
@@ -190,6 +195,8 @@ def scrape_property(
# New sorting # New sorting
sort_by=sort_by, sort_by=sort_by,
sort_direction=sort_direction, sort_direction=sort_direction,
# Pagination control
parallel=parallel,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)

View File

@@ -55,6 +55,9 @@ class ScraperInput(BaseModel):
sort_by: str | None = None sort_by: str | None = None
sort_direction: str = "desc" sort_direction: str = "desc"
# Pagination control
parallel: bool = True
class Scraper: class Scraper:
session = None session = None
@@ -141,6 +144,9 @@ class Scraper:
self.sort_by = scraper_input.sort_by self.sort_by = scraper_input.sort_by
self.sort_direction = scraper_input.sort_direction self.sort_direction = scraper_input.sort_direction
# Pagination control
self.parallel = scraper_input.parallel
def search(self) -> list[Union[Property | dict]]: ... def search(self) -> list[Union[Property | dict]]: ...
@staticmethod @staticmethod

View File

@@ -144,7 +144,15 @@ class RealtorScraper(Scraper):
# Determine date field based on listing type # Determine date field based on listing type
# Convert listing_type to list for uniform handling # Convert listing_type to list for uniform handling
if self.listing_type is None: if self.listing_type is None:
listing_types = [] # When None, return all common listing types as documented
# Note: NEW_COMMUNITY, OTHER, and READY_TO_BUILD are excluded as they typically return no results
listing_types = [
ListingType.FOR_SALE,
ListingType.FOR_RENT,
ListingType.SOLD,
ListingType.PENDING,
ListingType.OFF_MARKET,
]
date_field = None # When no listing_type is specified, skip date filtering date_field = None # When no listing_type is specified, skip date filtering
elif isinstance(self.listing_type, list): elif isinstance(self.listing_type, list):
listing_types = self.listing_type listing_types = self.listing_type
@@ -277,10 +285,14 @@ class RealtorScraper(Scraper):
else: else:
sort_param = "" #: prioritize normal fractal sort from realtor sort_param = "" #: prioritize normal fractal sort from realtor
# Handle PENDING with or_filters (applies if PENDING is in the list or is the single type) # Handle PENDING with or_filters
# Only use or_filters when PENDING is the only type or mixed only with FOR_SALE
# Using or_filters with other types (SOLD, FOR_RENT, etc.) will exclude those types
has_pending = ListingType.PENDING in listing_types has_pending = ListingType.PENDING in listing_types
other_types = [lt for lt in listing_types if lt not in [ListingType.PENDING, ListingType.FOR_SALE]]
use_or_filters = has_pending and len(other_types) == 0
pending_or_contingent_param = ( pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" if has_pending else "" "or_filters: { contingent: true, pending: true }" if use_or_filters else ""
) )
# Build bucket parameter (only use fractal sort if no custom sort is specified) # Build bucket parameter (only use fractal sort if no custom sort is specified)
@@ -526,39 +538,49 @@ class RealtorScraper(Scraper):
total = result["total"] total = result["total"]
homes = result["properties"] homes = result["properties"]
# Pre-check: Should we continue pagination? # Fetch remaining pages based on parallel parameter
# This optimization prevents unnecessary API calls when using time-based filters if self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit):
# with date sorting. If page 1's last property is outside the time window, if self.parallel:
# all future pages will also be outside (due to sort order). # Parallel mode: Fetch all remaining pages in parallel
should_continue_pagination = self._should_fetch_more_pages(homes) with ThreadPoolExecutor() as executor:
futures_with_offsets = [
(i, executor.submit(
self.general_search,
variables=search_variables | {"offset": i},
search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
)
]
# Only launch parallel pagination if needed # Collect results and sort by offset to preserve API sort order
if should_continue_pagination and self.offset + self.DEFAULT_PAGE_SIZE < min(total, self.offset + self.limit): results = []
with ThreadPoolExecutor() as executor: for offset, future in futures_with_offsets:
# Store futures with their offsets to maintain proper sort order results.append((offset, future.result()["properties"]))
# Start from offset + page_size and go up to offset + limit
futures_with_offsets = [ results.sort(key=lambda x: x[0])
(i, executor.submit( for offset, properties in results:
self.general_search, homes.extend(properties)
variables=search_variables | {"offset": i}, else:
# Sequential mode: Fetch pages one by one with early termination checks
for current_offset in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
):
# Check if we should continue based on time-based filters
if not self._should_fetch_more_pages(homes):
break
result = self.general_search(
variables=search_variables | {"offset": current_offset},
search_type=search_type, search_type=search_type,
))
for i in range(
self.offset + self.DEFAULT_PAGE_SIZE,
min(total, self.offset + self.limit),
self.DEFAULT_PAGE_SIZE,
) )
] page_properties = result["properties"]
homes.extend(page_properties)
# Collect results and sort by offset to preserve API sort order across pages
results = []
for offset, future in futures_with_offsets:
results.append((offset, future.result()["properties"]))
# Sort by offset and concatenate in correct order
results.sort(key=lambda x: x[0])
for offset, properties in results:
homes.extend(properties)
# Apply client-side hour-based filtering if needed # Apply client-side hour-based filtering if needed
# (API only supports day-level filtering, so we post-filter for hour precision) # (API only supports day-level filtering, so we post-filter for hour precision)

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.3" version = "0.8.5"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"