diff --git a/README.md b/README.md
index 78a5c97..a38c8f1 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,12 @@ Optional
 │    - 'farm'
 │    - 'land'
 │    - 'mobile'
-
+│
+├── return_type (option): Choose the return type.
+│    - 'pandas' (default)
+│    - 'pydantic'
+│    - 'raw' (json)
+│
 ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
 │    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
 │
diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 643e954..4333b58 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -3,12 +3,13 @@ import pandas as pd
 from .core.scrapers import ScraperInput
 from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
 from .core.scrapers.realtor import RealtorScraper
-from .core.scrapers.models import ListingType, SearchPropertyType
+from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property


 def scrape_property(
     location: str,
     listing_type: str = "for_sale",
+    return_type: str = "pandas",
     property_type: list[str] | None = None,
     radius: float = None,
     mls_only: bool = False,
@@ -19,12 +20,13 @@ def scrape_property(
     foreclosure: bool = None,
     extra_property_data: bool = True,
     exclude_pending: bool = False,
-    limit: int = 10000,
-) -> pd.DataFrame:
+    limit: int = 10000
+) -> pd.DataFrame | list[dict] | list[Property]:
     """
     Scrape properties from Realtor.com based on a given location and listing type.
     :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
     :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
+    :param return_type: Return type (pandas, pydantic, raw)
     :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
     :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
     :param mls_only: If set, fetches only listings with MLS IDs.
@@ -42,7 +44,8 @@ def scrape_property(

     scraper_input = ScraperInput(
         location=location,
-        listing_type=ListingType[listing_type.upper()],
+        listing_type=ListingType(listing_type.upper()),
+        return_type=ReturnType(return_type.lower()),
         property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
         proxy=proxy,
         radius=radius,
@@ -59,6 +62,9 @@ def scrape_property(
     site = RealtorScraper(scraper_input)
     results = site.search()

+    if scraper_input.return_type != ReturnType.pandas:
+        return results
+
     properties_dfs = [df for result in results if not (df := process_result(result)).empty]
     if not properties_dfs:
         return pd.DataFrame()
diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py
index 1c68061..466bb34 100644
--- a/homeharvest/core/scrapers/__init__.py
+++ b/homeharvest/core/scrapers/__init__.py
@@ -1,11 +1,13 @@
 from __future__ import annotations

 from dataclasses import dataclass
+from typing import Union
+
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 import uuid

 from ...exceptions import AuthenticationError
-from .models import Property, ListingType, SiteName, SearchPropertyType
+from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
 import json
@@ -24,6 +26,7 @@ class ScraperInput:
     extra_property_data: bool | None = True
     exclude_pending: bool | None = False
     limit: int = 10000
+    return_type: ReturnType = ReturnType.pandas


 class Scraper:
@@ -81,8 +84,9 @@ class Scraper:
         self.extra_property_data = scraper_input.extra_property_data
         self.exclude_pending = scraper_input.exclude_pending
         self.limit = scraper_input.limit
+        self.return_type = scraper_input.return_type

-    def search(self) -> list[Property]: ...
+    def search(self) -> list[Union[Property | dict]]: ...

     @staticmethod
     def _parse_home(home) -> Property: ...
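A minimal usage sketch of the new return_type argument, based on the scrape_property signature above and the test added at the end of this diff; the location, listing_type, and limit values are illustrative:

    from homeharvest import scrape_property, Property

    # default: processed results as a pandas DataFrame (unchanged behavior)
    df = scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100)

    # "pydantic": a list of Property objects
    homes = scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic")

    # "raw": the unprocessed dicts returned by the Realtor.com API
    listings = scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw")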
diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py
index 622c165..3238436 100644
--- a/homeharvest/core/scrapers/models.py
+++ b/homeharvest/core/scrapers/models.py
@@ -4,6 +4,12 @@ from enum import Enum
 from typing import Optional


+class ReturnType(Enum):
+    pydantic = "pydantic"
+    pandas = "pandas"
+    raw = "raw"
+
+
 class SiteName(Enum):
     ZILLOW = "zillow"
     REDFIN = "redfin"
@@ -148,6 +154,9 @@ class Property:
     property_url: str
     property_id: str

+    #: allows_cats: bool
+    #: allows_dogs: bool
+
     listing_id: str | None = None

     mls: str | None = None
@@ -167,6 +176,8 @@ class Property:
     hoa_fee: int | None = None
     days_on_mls: int | None = None
     description: Description | None = None
+    tags: list[str] | None = None
+    details: list[dict] | None = None
     latitude: float | None = None
     longitude: float | None = None

diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index 9a58ad1..e0e5207 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -32,8 +32,9 @@ from ..models import (
     Builder,
     Advertisers,
     Office,
+    ReturnType
 )
-from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA
+from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT


 class RealtorScraper(Scraper):
@@ -120,7 +121,7 @@ class RealtorScraper(Scraper):

         property_info = response_json["data"]["home"]

-        return [self.process_property(property_info, "home")]
+        return [self.process_property(property_info)]

     @staticmethod
     def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@@ -168,7 +169,7 @@ class RealtorScraper(Scraper):

         return processed_advertisers

-    def process_property(self, result: dict, query_name: str) -> Property | None:
+    def process_property(self, result: dict) -> Property | None:
         mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None

         if not mls and self.mls_only:
@@ -188,9 +189,7 @@ class RealtorScraper(Scraper):
             return

         property_id = result["property_id"]
-        prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {}
-        if not prop_details:
-            prop_details = self.process_extra_property_details(result)
+        prop_details = self.process_extra_property_details(result) if self.extra_property_data else {}

         property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
         estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
@@ -233,7 +232,7 @@ class RealtorScraper(Scraper):
         )
         return realty_property

-    def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
+    def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]:
         """
         Handles a location area & returns a list of properties
         """
@@ -324,6 +323,7 @@ class RealtorScraper(Scraper):
                         %s
                         %s
                     }
+                    bucket: { sort: "fractal_v1.1.3_fr" }
                     %s
                     limit: 200
                     offset: $offset
@@ -363,7 +363,7 @@ class RealtorScraper(Scraper):
         response_json = response.json()
         search_key = "home_search" if "home_search" in query else "property_search"

-        properties: list[Property] = []
+        properties: list[Union[Property, dict]] = []

         if (
             response_json is None
@@ -381,15 +381,25 @@ class RealtorScraper(Scraper):

         #: limit the number of properties to be processed
         #: example, if your offset is 200, and your limit is 250, return 50
-        properties_list = properties_list[: self.limit - offset]
+        properties_list: list[dict] = properties_list[: self.limit - offset]

-        with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
-            futures = [executor.submit(self.process_property, result, search_key) for result in properties_list]
+        if self.extra_property_data:
+            property_ids = [data["property_id"] for data in properties_list]
+            extra_property_details = self.get_bulk_prop_details(property_ids) or {}

-            for future in as_completed(futures):
-                result = future.result()
-                if result:
-                    properties.append(result)
+            for result in properties_list:
+                result.update(extra_property_details.get(result["property_id"], {}))
+
+        if self.return_type != ReturnType.raw:
+            with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
+                futures = [executor.submit(self.process_property, result) for result in properties_list]
+
+                for future in as_completed(futures):
+                    result = future.result()
+                    if result:
+                        properties.append(result)
+        else:
+            properties = properties_list

         return {
             "total": total_properties,
@@ -520,28 +530,35 @@ class RealtorScraper(Scraper):
         wait=wait_exponential(min=4, max=10),
         stop=stop_after_attempt(3),
     )
-    def get_prop_details(self, property_id: str) -> dict:
-        if not self.extra_property_data:
+    def get_bulk_prop_details(self, property_ids: list[str]) -> dict:
+        """
+        Fetch extra property details for multiple properties in a single GraphQL query.
+        Returns a map of property_id to its details.
+        """
+        if not self.extra_property_data or not property_ids:
             return {}

-        query = """query GetHome($property_id: ID!) {
-                    home(property_id: $property_id) {
-                        __typename
+        property_ids = list(set(property_ids))

-                        nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
-                            __typename schools { district { __typename id name } }
-                        }
-                        taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
-                    }
-                }"""
-
-        variables = {"property_id": property_id}
-        response = self.session.post(self.SEARCH_GQL_URL, json={"query": query, "variables": variables})
+        # Construct the bulk query
+        fragments = "\n".join(
+            f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}'
+            for property_id in property_ids
+        )
+        query = f"""{HOME_FRAGMENT}
+
+        query GetHomes {{
+            {fragments}
+        }}"""
+        response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})

         data = response.json()

-        property_details = data["data"]["home"]
-        return self.process_extra_property_details(property_details)
+        if "data" not in data:
+            return {}
+
+        properties = data["data"]
+        return {data.replace('home_', ''): properties[data] for data in properties if properties[data]}

     @staticmethod
     def _parse_neighborhoods(result: dict) -> Optional[str]:
diff --git a/homeharvest/core/scrapers/realtor/queries.py b/homeharvest/core/scrapers/realtor/queries.py
index 4df8e91..0e9de37 100644
--- a/homeharvest/core/scrapers/realtor/queries.py
+++ b/homeharvest/core/scrapers/realtor/queries.py
@@ -11,6 +11,34 @@ _SEARCH_HOMES_DATA_BASE = """{
     list_price_max
     list_price_min
     price_per_sqft
+    tags
+    details {
+        category
+        text
+        parent_category
+    }
+    pet_policy {
+        cats
+        dogs
+        dogs_small
+        dogs_large
+        __typename
+    }
+    units {
+        availability {
+            date
+            __typename
+        }
+        description {
+            baths_consolidated
+            baths
+            beds
+            sqft
+            __typename
+        }
+        list_price
+        __typename
+    }
     flags {
         is_contingent
         is_pending
@@ -64,11 +92,14 @@ _SEARCH_HOMES_DATA_BASE = """{
     tax_record {
         public_record_id
     }
-    primary_photo {
+    primary_photo(https: true) {
         href
     }
-    photos {
+    photos(https: true) {
         href
+        tags {
+            label
+        }
     }
     advertisers {
         email
@@ -116,15 +147,63 @@ _SEARCH_HOMES_DATA_BASE = """{
     }
     rental_management {
         name
+        href
         fulfillment_id
     }
 }
 """

+
+HOME_FRAGMENT = """
+fragment HomeData on Home {
+    property_id
+    nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
+        __typename schools { district { __typename id name } }
+    }
+    taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
+    monthly_fees {
+        description
+        display_amount
+    }
+    one_time_fees {
+        description
+        display_amount
+    }
+    parking {
+        unassigned_space_rent
+        assigned_spaces_available
+        description
+        assigned_space_rent
+    }
+    terms {
+        text
+        category
+    }
+}
+"""
+
 HOMES_DATA = """%s
     nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
         __typename schools { district { __typename id name } }
     }
+    monthly_fees {
+        description
+        display_amount
+    }
+    one_time_fees {
+        description
+        display_amount
+    }
+    parking {
+        unassigned_space_rent
+        assigned_spaces_available
+        description
+        assigned_space_rent
+    }
+    terms {
+        text
+        category
+    }
     taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
     estimates {
         __typename
diff --git a/pyproject.toml b/pyproject.toml
index d797a3b..161c55f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.4.5"
+version = "0.4.6"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/HomeHarvest"
diff --git a/tests/test_realtor.py b/tests/test_realtor.py
index df2249e..82b49cd 100644
--- a/tests/test_realtor.py
+++ b/tests/test_realtor.py
@@ -1,4 +1,5 @@
-from homeharvest import scrape_property
+from homeharvest import scrape_property, Property
+import pandas as pd


 def test_realtor_pending_or_contingent():
@@ -287,3 +288,15 @@ def test_phone_number_matching():

     #: assert phone numbers are the same
     assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
+
+
+def test_return_type():
+    results = {
+        "pandas": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100),
+        "pydantic": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic"),
+        "raw": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw"),
+    }
+
+    assert isinstance(results["pandas"], pd.DataFrame)
+    assert isinstance(results["pydantic"][0], Property)
+    assert isinstance(results["raw"][0], dict)
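For context on the get_bulk_prop_details change above: rather than sending one GetHome request per property, each property_id becomes a GraphQL alias on the home field, so a single request covers a whole page of results. A standalone sketch of that batching pattern follows; the trimmed fragment, helper name, and ids are placeholders, not the real HOME_FRAGMENT:

    # Illustrative only: mirrors the alias-based batching used in get_bulk_prop_details.
    HOME_FRAGMENT = "fragment HomeData on Home { property_id }"  # placeholder fragment

    def build_bulk_query(property_ids: list[str]) -> str:
        aliased = "\n".join(
            f"home_{pid}: home(property_id: {pid}) {{ ...HomeData }}"
            for pid in set(property_ids)  # de-duplicate ids, as the method does
        )
        return f"{HOME_FRAGMENT}\n\nquery GetHomes {{\n{aliased}\n}}"

    print(build_bulk_query(["1234567", "7654321"]))
    # Each response key comes back as "home_<property_id>"; the method strips the
    # "home_" prefix to build its property_id -> details map.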