From 8311f4dfbc688e28e219722010331cad69842c9f Mon Sep 17 00:00:00 2001
From: Zachary Hampton
Date: Tue, 15 Jul 2025 12:00:19 -0700
Subject: [PATCH] - data additions

---
Review notes: line structure was restored from a whitespace-collapsed copy,
so context-line indentation is reconstructed and should be checked against
the tree; the blob ids on the "index" lines predate the review changes.

 homeharvest/core/scrapers/models.py           |  1 +
 homeharvest/core/scrapers/realtor/__init__.py | 18 +++++++-
 homeharvest/core/scrapers/realtor/queries.py  | 46 +++++++++++++++++++
 tests/test_realtor.py                         | 12 +++++
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py
index 5ddc171..a6faba0 100644
--- a/homeharvest/core/scrapers/models.py
+++ b/homeharvest/core/scrapers/models.py
@@ -76,6 +76,7 @@ class PropertyType(Enum):
 
 @dataclass
 class Address:
+    formatted_address: str | None = None
     full_line: str | None = None
     street: str | None = None
     unit: str | None = None
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index a89fcfa..9cc3f07 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -391,7 +391,23 @@ class RealtorScraper(Scraper):
             extra_property_details = self.get_bulk_prop_details(property_ids) or {}
 
             for result in properties_list:
-                result.update(extra_property_details.get(result["property_id"], {}))
+                specific_details_for_property = extra_property_details.get(result["property_id"]) or {}
+
+                #: "location" comes back from both the search query and the details query, so a plain
+                #: update() would replace the search result's location with the details' one; merge that
+                #: key one level deeper instead (migrate to a func if more fields need this).
+                #: A property absent from the bulk details, or a missing/None "location" on either side,
+                #: is tolerated, and the fetched details dict is not mutated.
+                extra_location = specific_details_for_property.get("location")
+                if extra_location:
+                    if result.get("location"):
+                        result["location"].update(extra_location)
+                    else:
+                        result["location"] = dict(extra_location)
+
+                result.update(
+                    {key: value for key, value in specific_details_for_property.items() if key != "location"}
+                )
 
         if self.return_type != ReturnType.raw:
             with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
diff --git a/homeharvest/core/scrapers/realtor/queries.py b/homeharvest/core/scrapers/realtor/queries.py
index d3dad3c..e21c30f 100644
--- a/homeharvest/core/scrapers/realtor/queries.py
+++ b/homeharvest/core/scrapers/realtor/queries.py
@@ -3,8 +3,10 @@ _SEARCH_HOMES_DATA_BASE = """{
     listing_id
     property_id
     href
+    permalink
     list_date
     status
+    mls_status
     last_sold_price
     last_sold_date
     list_price
@@ -12,6 +14,15 @@ _SEARCH_HOMES_DATA_BASE = """{
     list_price_min
     price_per_sqft
     tags
+    open_houses {
+        start_date
+        end_date
+        description
+        time_zone
+        dst
+        href
+        methods
+    }
     details {
         category
         text
@@ -154,6 +165,7 @@ _SEARCH_HOMES_DATA_BASE = """{
         }
         mls_set
         nrds_id
+        state_license
         rental_corporation {
             fulfillment_id
         }
@@ -172,6 +184,23 @@ fragment HomeData on Home {
     nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
         __typename schools { district { __typename id name } }
     }
+    popularity {
+        periods {
+            clicks_total
+            views_total
+            dwell_time_mean
+            dwell_time_median
+            leads_total
+            shares_total
+            saves_total
+            last_n_days
+        }
+    }
+    location {
+        parcel {
+            parcel_id
+        }
+    }
     taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
     monthly_fees {
         description
@@ -206,6 +235,23 @@ HOMES_DATA = """%s
         description
         display_amount
     }
+    popularity {
+        periods {
+            clicks_total
+            views_total
+            dwell_time_mean
+            dwell_time_median
+            leads_total
+            shares_total
+            saves_total
+            last_n_days
+        }
+    }
+    location {
+        parcel {
+            parcel_id
+        }
+    }
     parking {
         unassigned_space_rent
         assigned_spaces_available
diff --git a/tests/test_realtor.py b/tests/test_realtor.py
index c2bc713..673c3c1 100644
--- a/tests/test_realtor.py
+++ b/tests/test_realtor.py
@@ -303,3 +303,15 @@ def test_return_type():
     assert all(isinstance(result, pd.DataFrame) for result in results["pandas"])
     assert all(isinstance(result[0], Property) for result in results["pydantic"])
     assert all(isinstance(result[0], dict) for result in results["raw"])
+
+
+def test_has_open_house():
+    """Raw results carry a non-None `open_houses` for an address search and for a zip code search."""
+    address_result = scrape_property("1 Hawthorne St Unit 12F, San Francisco, CA 94105", return_type="raw")
+    assert address_result[0]["open_houses"] is not None  #: has open house data from address search
+
+    zip_code_result = scrape_property("94105", return_type="raw")
+    address_from_zip_result = [row for row in zip_code_result if row["property_id"] == "1264014746"]
+
+    assert address_from_zip_result, "property 1264014746 missing from zip code search results"
+    assert address_from_zip_result[0]["open_houses"] is not None  #: has open house data from general search