- added return_type parameter (pandas, pydantic, or raw)

- optimized fetching of extra property fields with query clustering (one bulk GraphQL request instead of one request per property)
master v0.4.6
zachary 2025-04-12 17:55:52 -07:00
parent 65f799a27d
commit 8a5683fe79
8 changed files with 177 additions and 42 deletions

View File

@ -83,7 +83,12 @@ Optional
- 'farm'
- 'land'
- 'mobile'
├── return_type (option): Output format of the results.
│ - 'pandas' (default)
│ - 'pydantic'
│ - 'raw' (json)
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
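
A quick usage sketch for the new option (illustrative only; it mirrors the test added at the bottom of this commit and assumes the top-level scrape_property import):

from homeharvest import scrape_property

# Default behavior: results come back as a pandas DataFrame
df = scrape_property(location="Dallas, TX", listing_type="for_sale")

# Pydantic models (list[Property]) or raw JSON dicts instead
models = scrape_property(location="Dallas, TX", listing_type="for_sale", return_type="pydantic")
raw = scrape_property(location="Dallas, TX", listing_type="for_sale", return_type="raw")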

View File

@ -3,12 +3,13 @@ import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property
def scrape_property(
location: str,
listing_type: str = "for_sale",
return_type: str = "pandas",
property_type: list[str] | None = None,
radius: float = None,
mls_only: bool = False,
@ -19,12 +20,13 @@ def scrape_property(
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
limit: int = 10000,
) -> pd.DataFrame:
limit: int = 10000
) -> pd.DataFrame | list[dict] | list[Property]:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
:param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
@ -42,7 +44,8 @@ def scrape_property(
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
listing_type=ListingType(listing_type.upper()),
return_type=ReturnType(return_type.lower()),
property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
proxy=proxy,
radius=radius,
@ -59,6 +62,9 @@ def scrape_property(
site = RealtorScraper(scraper_input)
results = site.search()
if scraper_input.return_type != ReturnType.pandas:
return results
properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs:
return pd.DataFrame()

View File

@ -1,11 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json
@ -24,6 +26,7 @@ class ScraperInput:
extra_property_data: bool | None = True
exclude_pending: bool | None = False
limit: int = 10000
return_type: ReturnType = ReturnType.pandas
class Scraper:
@ -81,8 +84,9 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
def search(self) -> list[Property]: ...
def search(self) -> list[Union[Property, dict]]: ...
@staticmethod
def _parse_home(home) -> Property: ...

View File

@ -4,6 +4,12 @@ from enum import Enum
from typing import Optional
class ReturnType(Enum):
pydantic = "pydantic"
pandas = "pandas"
raw = "raw"
class SiteName(Enum):
ZILLOW = "zillow"
REDFIN = "redfin"
@ -148,6 +154,9 @@ class Property:
property_url: str
property_id: str
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None
mls: str | None = None
@ -167,6 +176,8 @@ class Property:
hoa_fee: int | None = None
days_on_mls: int | None = None
description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
latitude: float | None = None
longitude: float | None = None

View File

@ -32,8 +32,9 @@ from ..models import (
Builder,
Advertisers,
Office,
ReturnType
)
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT
class RealtorScraper(Scraper):
@ -120,7 +121,7 @@ class RealtorScraper(Scraper):
property_info = response_json["data"]["home"]
return [self.process_property(property_info, "home")]
return [self.process_property(property_info)]
@staticmethod
def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@ -168,7 +169,7 @@ class RealtorScraper(Scraper):
return processed_advertisers
def process_property(self, result: dict, query_name: str) -> Property | None:
def process_property(self, result: dict) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only:
@ -188,9 +189,7 @@ class RealtorScraper(Scraper):
return
property_id = result["property_id"]
prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {}
if not prop_details:
prop_details = self.process_extra_property_details(result)
prop_details = self.process_extra_property_details(result) if self.extra_property_data else {}
property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
@ -233,7 +232,7 @@ class RealtorScraper(Scraper):
)
return realty_property
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property], list[dict]]]:
"""
Handles a location area & returns a list of properties
"""
@ -324,6 +323,7 @@ class RealtorScraper(Scraper):
%s
%s
}
bucket: { sort: "fractal_v1.1.3_fr" }
%s
limit: 200
offset: $offset
@ -363,7 +363,7 @@ class RealtorScraper(Scraper):
response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Property] = []
properties: list[Union[Property, dict]] = []
if (
response_json is None
@ -381,15 +381,25 @@ class RealtorScraper(Scraper):
#: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[: self.limit - offset]
properties_list: list[dict] = properties_list[: self.limit - offset]
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result, search_key) for result in properties_list]
if self.extra_property_data:
property_ids = [data["property_id"] for data in properties_list]
extra_property_details = self.get_bulk_prop_details(property_ids) or {}
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
for result in properties_list:
result.update(extra_property_details.get(result["property_id"], {}))
if self.return_type != ReturnType.raw:
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result) for result in properties_list]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
else:
properties = properties_list
return {
"total": total_properties,
@ -520,28 +530,35 @@ class RealtorScraper(Scraper):
wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3),
)
def get_prop_details(self, property_id: str) -> dict:
if not self.extra_property_data:
def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
"""
Fetch extra property details for multiple properties in a single GraphQL query.
Returns a map of property_id to its details.
"""
if not self.extra_property_data or not property_ids:
return {}
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
property_ids = list(set(property_ids))
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
}
}"""
variables = {"property_id": property_id}
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query, "variables": variables})
# Construct the bulk query
fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}'
for property_id in property_ids
)
query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json()
property_details = data["data"]["home"]
return self.process_extra_property_details(property_details)
if "data" not in data:
return {}
properties = data["data"]
return {key.replace('home_', ''): properties[key] for key in properties if properties[key]}
@staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]:
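
For reference, the clustered query assembled in get_bulk_prop_details above takes roughly this shape; a standalone sketch with a trimmed-down fragment and hypothetical property IDs:

# Illustrative sketch of the query-clustering pattern (hypothetical IDs, trimmed fragment).
HOME_FRAGMENT = "fragment HomeData on Home { property_id }"
property_ids = ["1234567890", "9876543210"]
fragments = "\n".join(
    f"home_{pid}: home(property_id: {pid}) {{ ...HomeData }}" for pid in property_ids
)
query = f"""{HOME_FRAGMENT}
query GetHomes {{
{fragments}
}}"""
print(query)
# Each aliased field comes back under data["data"]["home_<property_id>"], which the
# return statement above maps back onto its property_id.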

View File

@ -11,6 +11,34 @@ _SEARCH_HOMES_DATA_BASE = """{
list_price_max
list_price_min
price_per_sqft
tags
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
list_price
__typename
}
flags {
is_contingent
is_pending
@ -64,11 +92,14 @@ _SEARCH_HOMES_DATA_BASE = """{
tax_record {
public_record_id
}
primary_photo {
primary_photo(https: true) {
href
}
photos {
photos(https: true) {
href
tags {
label
}
}
advertisers {
email
@ -116,15 +147,63 @@ _SEARCH_HOMES_DATA_BASE = """{
}
rental_management {
name
href
fulfillment_id
}
}
"""
HOME_FRAGMENT = """
fragment HomeData on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
}
"""
HOMES_DATA = """%s
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.4.5"
version = "0.4.6"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -1,4 +1,5 @@
from homeharvest import scrape_property
from homeharvest import scrape_property, Property
import pandas as pd
def test_realtor_pending_or_contingent():
@ -287,3 +288,15 @@ def test_phone_number_matching():
#: assert phone numbers are the same
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_return_type():
results = {
"pandas": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100),
"pydantic": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic"),
"raw": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw"),
}
assert isinstance(results["pandas"], pd.DataFrame)
assert isinstance(results["pydantic"][0], Property)
assert isinstance(results["raw"][0], dict)