Compare commits

...

9 Commits

Author SHA1 Message Date
zacharyhampton
cab0216f29 Version bump to 0.8.13
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 12:30:46 -07:00
zacharyhampton
8ee720ce5c Version bump to 0.8.12
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-20 15:30:26 -07:00
zacharyhampton
8eb138ee1a Version bump to 0.8.11
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 22:42:01 -07:00
Zachary Hampton
ef6db606fd Version bump to 0.8.10
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-16 18:32:33 -08:00
zacharyhampton
9406c92a66 Version bump to 0.8.9
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 17:55:33 -08:00
zacharyhampton
fefacdd264 Version bump to 0.8.8
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 17:32:06 -08:00
Zachary Hampton
3579c10196 Merge pull request #147 from ZacharyHampton/feature/ios-mobile-headers
Improve API stability and reliability
2025-12-05 19:30:25 -08:00
Zachary Hampton
f5784e0191 Update to iOS mobile app headers for improved API stability
- Replace browser-based headers with iOS mobile app headers
- Update GraphQL query names to match iOS app conventions (1:1 alignment)
- Add _graphql_post() wrapper to centralize GraphQL calls with dynamic operation names
- Simplify session management by removing unnecessary thread-local complexity
- Add test_parallel_search_consistency test to verify concurrent request stability
- Bump version from 0.8.6b to 0.8.7

Changes fix API flakiness under concurrent load - parallel consistency test now passes 100% (5/5 runs).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 19:27:47 -08:00
Zachary Hampton
57093f5d17 Merge pull request #145 from ZacharyHampton/fix/realtor-403-error
Fix 403 error from Realtor.com API changes
2025-12-04 23:10:32 -08:00
5 changed files with 563 additions and 102 deletions

View File

@@ -5,6 +5,7 @@ import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
import secrets
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json import json
@@ -73,35 +74,30 @@ class Scraper:
if not self.session: if not self.session:
Scraper.session = requests.Session() Scraper.session = requests.Session()
retries = Retry( retries = Retry(
total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"]) total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"])
) )
adapter = HTTPAdapter(max_retries=retries) adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter) Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter) Scraper.session.mount("https://", adapter)
Scraper.session.headers.update( Scraper.session.headers.update(
{ {
'sec-ch-ua-platform': '"macOS"', 'Content-Type': 'application/json',
'rdc-client-name': 'rdc-search-for-sale-desktop', 'apollographql-client-version': '26.11.1-26.11.1.1106489',
'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"', 'Accept': '*/*',
'sec-ch-ua-mobile': '?0', 'Accept-Language': 'en-US,en;q=0.9',
'rdc-client-version': '0.1.0', 'rdc-client-version': '26.11.1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'X-APOLLO-OPERATION-TYPE': 'query',
'accept': 'application/json', 'X-APOLLO-OPERATION-ID': secrets.token_hex(32),
'content-type': 'application/json', 'rdc-client-name': 'RDC_NATIVE_MOBILE-iPhone-com.move.Realtor',
'origin': 'https://www.realtor.com', 'apollographql-client-name': 'com.move.Realtor-apollo-ios',
'sec-fetch-site': 'same-site', 'User-Agent': 'Realtor.com/26.11.1.1106489 CFNetwork/3860.200.71 Darwin/25.1.0',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.realtor.com/',
'accept-language': 'en-US,en;q=0.9',
'priority': 'u=1, i',
} }
) )
if scraper_input.proxy: self.proxy = scraper_input.proxy
proxy_url = scraper_input.proxy if self.proxy:
proxies = {"http": proxy_url, "https": proxy_url} proxies = {"http": self.proxy, "https": self.proxy}
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type

View File

@@ -8,6 +8,7 @@ This module implements the scraper for realtor.com
from __future__ import annotations from __future__ import annotations
import json import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
@@ -21,12 +22,13 @@ from tenacity import (
) )
from .. import Scraper from .. import Scraper
from ....exceptions import AuthenticationError
from ..models import ( from ..models import (
Property, Property,
ListingType, ListingType,
ReturnType ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT, MORPHEUS_SUGGESTIONS_QUERY
from .processors import ( from .processors import (
process_property, process_property,
process_extra_property_details, process_extra_property_details,
@@ -42,40 +44,59 @@ class RealtorScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
def handle_location(self): @staticmethod
query = """query Search_suggestions($searchInput: SearchSuggestionsInput!) { def _minify_query(query: str) -> str:
search_suggestions(search_input: $searchInput) { """Minify GraphQL query by collapsing whitespace to single spaces."""
geo_results { # Split on whitespace, filter empty strings, join with single space
type return ' '.join(query.split())
text
geo {
_id
area_type
city
state_code
postal_code
county
centroid { lat lon }
slug_id
geo_id
}
}
}
}"""
def _graphql_post(self, query: str, variables: dict, operation_name: str) -> dict:
"""
Execute a GraphQL query with operation-specific headers.
Args:
query: GraphQL query string (must include operationName matching operation_name param)
variables: Query variables dictionary
operation_name: Name of the GraphQL operation for Apollo headers
Returns:
Response JSON dictionary
"""
# Set operation-specific header (must match query's operationName)
self.session.headers['X-APOLLO-OPERATION-NAME'] = operation_name
payload = {
"operationName": operation_name, # Include in payload
"query": self._minify_query(query),
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, data=json.dumps(payload, separators=(',', ':')))
if response.status_code == 403:
if not self.proxy:
raise AuthenticationError(
"Received 403 Forbidden from Realtor.com API.",
response=response
)
else:
raise Exception("Received 403 Forbidden, retrying...")
return response.json()
@retry(
retry=retry_if_exception_type(Exception),
wait=wait_exponential(multiplier=1, min=1, max=4),
stop=stop_after_attempt(3),
)
def handle_location(self):
variables = { variables = {
"searchInput": { "searchInput": {
"search_term": self.location "search_term": self.location
} }
} }
payload = { response_json = self._graphql_post(MORPHEUS_SUGGESTIONS_QUERY, variables, "GetMorpheusSuggestions")
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
if ( if (
response_json is None response_json is None
@@ -86,6 +107,11 @@ class RealtorScraper(Scraper):
or "geo_results" not in response_json["data"]["search_suggestions"] or "geo_results" not in response_json["data"]["search_suggestions"]
or not response_json["data"]["search_suggestions"]["geo_results"] or not response_json["data"]["search_suggestions"]["geo_results"]
): ):
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if response_json and "errors" in response_json:
error_msgs = [e.get("message", "") for e in response_json.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return None return None
geo_result = response_json["data"]["search_suggestions"]["geo_results"][0] geo_result = response_json["data"]["search_suggestions"]["geo_results"][0]
@@ -109,24 +135,22 @@ class RealtorScraper(Scraper):
return result return result
def get_latest_listing_id(self, property_id: str) -> str | None: def get_latest_listing_id(self, property_id: str) -> str | None:
query = """query Property($property_id: ID!) { query = """
fragment ListingFragment on Listing {
listing_id
primary
}
query GetPropertyListingId($property_id: ID!) {
property(id: $property_id) { property(id: $property_id) {
listings { listings {
listing_id ...ListingFragment
primary
} }
} }
} }
""" """
variables = {"property_id": property_id} variables = {"property_id": property_id}
payload = { response_json = self._graphql_post(query, variables, "GetPropertyListingId")
"query": query,
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["property"]
if property_info["listings"] is None: if property_info["listings"] is None:
@@ -144,18 +168,19 @@ class RealtorScraper(Scraper):
def handle_home(self, property_id: str) -> list[Property]: def handle_home(self, property_id: str) -> list[Property]:
"""Fetch single home with proper error handling.""" """Fetch single home with proper error handling."""
query = ( query = (
"""query Home($property_id: ID!) { """%s
home(property_id: $property_id) %s query GetHomeDetails($property_id: ID!) {
home(property_id: $property_id) {
...HomeDetailsFragment
}
}""" }"""
% HOMES_DATA % HOME_FRAGMENT
) )
variables = {"property_id": property_id} variables = {"property_id": property_id}
payload = {"query": query, "variables": variables}
try: try:
response = self.session.post(self.SEARCH_GQL_URL, json=payload) data = self._graphql_post(query, variables, "GetHomeDetails")
data = response.json()
# Check for errors or missing data # Check for errors or missing data
if "errors" in data or "data" not in data: if "errors" in data or "data" not in data:
@@ -374,12 +399,12 @@ class RealtorScraper(Scraper):
is_foreclosure = "foreclosure: false" is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address if search_type == "comps": #: comps search, came from an address
query = """query Property_search( query = """query GetHomeSearch(
$coordinates: [Float]! $coordinates: [Float]!
$radius: String! $radius: String!
$offset: Int!, $offset: Int!,
) { ) {
home_search( homeSearch: home_search(
query: { query: {
%s %s
nearby: { nearby: {
@@ -396,7 +421,9 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
}""" % ( }
%s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -405,13 +432,15 @@ class RealtorScraper(Scraper):
pending_or_contingent_param, pending_or_contingent_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query Home_search( query = """query GetHomeSearch(
$search_location: SearchLocation, $search_location: SearchLocation,
$offset: Int, $offset: Int,
) { ) {
home_search( homeSearch: home_search(
query: { query: {
%s %s
search_location: $search_location search_location: $search_location
@@ -426,7 +455,9 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
}""" % ( }
%s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -436,32 +467,30 @@ class RealtorScraper(Scraper):
bucket_param, bucket_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
else: #: general search, came from an address else: #: general search, came from an address
query = ( query = (
"""query Property_search( """query GetHomeSearch(
$property_id: [ID]! $property_id: [ID]!
$offset: Int!, $offset: Int!,
) { ) {
home_search( homeSearch: home_search(
query: { query: {
property_id: $property_id property_id: $property_id
} }
limit: 1 limit: 1
offset: $offset offset: $offset
) %s ) %s
}""" }
% GENERAL_RESULTS_QUERY %s
%s"""
% (GENERAL_RESULTS_QUERY, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT)
) )
payload = { response_json = self._graphql_post(query, variables, "GetHomeSearch")
"query": query, search_key = "homeSearch"
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Union[Property, dict]] = [] properties: list[Union[Property, dict]] = []
@@ -1081,8 +1110,8 @@ class RealtorScraper(Scraper):
@retry( @retry(
retry=retry_if_exception_type(JSONDecodeError), retry=retry_if_exception_type((JSONDecodeError, Exception)),
wait=wait_exponential(min=4, max=10), wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
) )
def get_bulk_prop_details(self, property_ids: list[str]) -> dict: def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
@@ -1095,24 +1124,27 @@ class RealtorScraper(Scraper):
property_ids = list(set(property_ids)) property_ids = list(set(property_ids))
# Construct the bulk query
fragments = "\n".join( fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}' f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeDetailsFragment }}'
for property_id in property_ids for property_id in property_ids
) )
query = f"""{HOME_FRAGMENT} query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query}) query GetHomeDetails {{
data = response.json() {fragments}
}}"""
if "data" not in data: data = self._graphql_post(query, {}, "GetHomeDetails")
if "data" not in data or data["data"] is None:
# If we got a 400 error with "Required parameter is missing", raise to trigger retry
if data and "errors" in data:
error_msgs = [e.get("message", "") for e in data.get("errors", [])]
if any("Required parameter is missing" in msg for msg in error_msgs):
raise Exception(f"Transient API error: {error_msgs}")
return {} return {}
properties = data["data"] properties = data["data"]
return {data.replace('home_', ''): properties[data] for data in properties if properties[data]} return {key.replace('home_', ''): properties[key] for key in properties if properties[key]}

View File

@@ -1,3 +1,193 @@
SEARCH_RESULTS_FRAGMENT = """
fragment SearchFragment on SearchHome {
__typename
pending_date
listing_id
property_id
href
permalink
list_date
status
mls_status
last_sold_price
last_sold_date
last_status_change_date
last_update_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
open_houses {
start_date
end_date
description
time_zone
dst
href
methods
}
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
photos(https: true) {
title
href
tags {
label
}
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
}
tax_record {
cl_id
public_record_id
last_update_date
apn
tax_parcel_id
}
primary_photo(https: true) {
href
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
state_license
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
current_estimates {
__typename
source {
__typename
type
name
}
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
"""
_SEARCH_HOMES_DATA_BASE = """{ _SEARCH_HOMES_DATA_BASE = """{
pending_date pending_date
listing_id listing_id
@@ -181,8 +371,189 @@ _SEARCH_HOMES_DATA_BASE = """{
HOME_FRAGMENT = """ HOME_FRAGMENT = """
fragment HomeData on Home { fragment HomeDetailsFragment on Home {
__typename
pending_date
listing_id
property_id property_id
href
permalink
list_date
status
mls_status
last_sold_price
last_sold_date
last_status_change_date
last_update_date
list_price
list_price_max
list_price_min
price_per_sqft
tags
open_houses {
start_date
end_date
description
time_zone
dst
href
methods
}
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
photos(https: true) {
title
href
tags {
label
}
}
list_price
__typename
}
flags {
is_contingent
is_pending
is_new_construction
}
description {
type
sqft
beds
baths_full
baths_half
lot_sqft
year_built
garage
type
name
stories
text
}
source {
id
listing_id
}
hoa {
fee
}
location {
address {
street_direction
street_number
street_name
street_suffix
line
unit
city
state_code
postal_code
coordinate {
lon
lat
}
}
county {
name
fips_code
}
neighborhoods {
name
}
parcel {
parcel_id
}
}
tax_record {
cl_id
public_record_id
last_update_date
apn
tax_parcel_id
}
primary_photo(https: true) {
href
}
photos(https: true) {
title
href
tags {
label
}
}
advertisers {
email
broker {
name
fulfillment_id
}
type
name
fulfillment_id
builder {
name
fulfillment_id
}
phones {
ext
primary
type
number
}
office {
name
email
fulfillment_id
href
phones {
number
type
primary
ext
}
mls_set
}
corporation {
specialties
name
bio
href
fulfillment_id
}
mls_set
nrds_id
state_license
rental_corporation {
fulfillment_id
}
rental_management {
name
href
fulfillment_id
}
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
} }
@@ -198,11 +569,6 @@ fragment HomeData on Home {
last_n_days last_n_days
} }
} }
location {
parcel {
parcel_id
}
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } } taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
property_history { property_history {
date date
@@ -227,6 +593,18 @@ fragment HomeData on Home {
text text
category category
} }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
} }
""" """
@@ -300,8 +678,38 @@ current_estimates {
} }
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{ # Query body using inline fields (kept for backward compatibility)
GENERAL_RESULTS_QUERY_BODY = """{
count count
total total
results %s results %s
}""" % SEARCH_HOMES_DATA }""" % SEARCH_HOMES_DATA
GENERAL_RESULTS_QUERY = """{
__typename
count
total
results {
__typename
...SearchFragment
...ListingPhotosFragment
}
}"""
LISTING_PHOTOS_FRAGMENT = """
fragment ListingPhotosFragment on SearchHome {
__typename
photos(https: true) {
__typename
title
href
tags {
__typename
label
probability
}
}
}
"""
MORPHEUS_SUGGESTIONS_QUERY = """query GetMorpheusSuggestions($searchInput: SearchSuggestionsInput!) { search_suggestions(search_input: $searchInput) { __typename geo_results { __typename type text geo { __typename _id _score mpr_id area_type city state_code postal_code country lat lon county counties { __typename name fips state_code } slug_id geo_id score name city_slug_id centroid { __typename lat lon } county_needed_for_uniq street line school school_id school_district school_district_id has_catchment university university_id neighborhood park } } no_matches has_results filter_criteria { __typename property_type { __typename type } price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } feature_tags { __typename tags } listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house virtual_tour three_d_tour contingent hide_contingent pending hide_pending } keyword { __typename keywords } garage { __typename min max pattern } age { __typename min max pattern } stories { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } home_size { __typename min max pattern } basement finished_basement pool waterfront fireplace detached_garage expand { __typename radius } hoa { __typename type fee } } message_data { __typename property_type pool waterfront fireplace basement finished_basement detached_garage listing_status { __typename new_construction existing_homes foreclosures recently_sold fifty_five_plus open_house hide_new_construction hide_existing_homes hide_foreclosures hide_recently_sold hide_fifty_five_plus hide_open_house } keywords price { __typename min max pattern } bed { __typename min max pattern } bath { __typename min max pattern } garage { __typename min max pattern } stories { __typename min max pattern } age { __typename min max pattern } lot_size { __typename min max pattern } square_feet { __typename min max pattern } } original_string morpheus_context } }"""

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.6b" version = "0.8.13"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -1,4 +1,5 @@
import pytz import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed
from homeharvest import scrape_property, Property from homeharvest import scrape_property, Property
import pandas as pd import pandas as pd
@@ -307,6 +308,30 @@ def test_phone_number_matching():
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0] assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_parallel_search_consistency():
"""Test that the same search executed 3 times in parallel returns consistent results"""
def search_task():
return scrape_property(
location="Phoenix, AZ",
listing_type="for_sale",
limit=100
)
with ThreadPoolExecutor(max_workers=3) as executor:
futures = [executor.submit(search_task) for _ in range(3)]
results = [future.result() for future in as_completed(futures)]
# Verify all results are valid
assert all([result is not None for result in results])
assert all([isinstance(result, pd.DataFrame) for result in results])
assert all([len(result) > 0 for result in results])
# Verify all results have the same length (primary consistency check)
lengths = [len(result) for result in results]
assert len(set(lengths)) == 1, \
f"All parallel searches should return same number of results, got lengths: {lengths}"
def test_return_type(): def test_return_type():
results = { results = {
"pandas": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)], "pandas": [scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)],