diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index a0973ba..6f2c54f 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -13,9 +13,10 @@ def scrape_property( mls_only: bool = False, past_days: int = None, proxy: str = None, - date_from: str = None, + date_from: str = None, #: TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation date_to: str = None, foreclosure: bool = None, + extra_property_data: bool = True, ) -> pd.DataFrame: """ Scrape properties from Realtor.com based on a given location and listing type. @@ -23,9 +24,11 @@ def scrape_property( :param listing_type: Listing Type (for_sale, for_rent, sold) :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param mls_only: If set, fetches only listings with MLS IDs. + :param proxy: Proxy to use for scraping :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28 - :param proxy: Proxy to use for scraping + :param foreclosure: If set, fetches only foreclosure listings. + :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) """ validate_input(listing_type) validate_dates(date_from, date_to) @@ -51,4 +54,5 @@ def scrape_property( with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) - return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties] + + return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""}) diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 1a29fb8..b5758c2 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -76,10 +76,27 @@ class Description: text: str | None = None +@dataclass +class AgentPhone: #: For documentation purposes only (at the moment) + number: str | None = None + type: str | None = None + primary: bool | None = None + ext: str | None = None + + @dataclass class Agent: + name: str | None = None + phones: list[dict] | AgentPhone | None = None + email: str | None = None + href: str | None = None + + +@dataclass +class Broker: name: str | None = None phone: str | None = None + website: str | None = None @dataclass diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index dd02cef..3d238d0 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -651,26 +651,64 @@ class RealtorScraper(Scraper): return homes def get_prop_details(self, property_id: str) -> dict: - payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }} taxHistory: tax_history {{ __typename tax year assessment {{ __typename building land total }} }}estimates {{ __typename currentValues: current_values {{ __typename source {{ __typename type name }} estimate estimateHigh: estimate_high estimateLow: estimate_low date isBestHomeValue: isbest_homevalue }} }} }}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}' - response = self.session.post(self.PROPERTY_GQL, data=payload) + query = """query GetHome($property_id: ID!) { + home(property_id: $property_id) { + __typename + + advertisers { + __typename + type + name + email + phones { number type ext primary } + } + + + nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { + __typename schools { district { __typename id name } } + } + taxHistory: tax_history { __typename tax year assessment { __typename building land total } } + estimates { + __typename + currentValues: current_values { + __typename + source { __typename type name } + estimate + estimateHigh: estimate_high + estimateLow: estimate_low + date + isBestHomeValue: isbest_homevalue + } + } + } + }""" + + variables = {"property_id": property_id} + response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables}) + data = response.json() def get_key(keys: list): try: - data = response.json() + value = data for key in keys: - data = data[key] - return data - except (KeyError, TypeError): + value = value[key] + + return value or {} + except (KeyError, TypeError, IndexError): return {} - ads = get_key(["data", "home", "consumerAdvertisers"]) + ads = get_key(["data", "home", "advertisers"]) schools = get_key(["data", "home", "nearbySchools", "schools"]) assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"]) estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"]) - agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads] + agents = [Agent( + name=ad["name"], + email=ad["email"], + phones=ad["phones"] + ) for ad in ads] - schools = [school["district"]["name"] for school in schools] + schools = [school["district"]["name"] for school in schools if school['district'].get('name')] return { "agents": agents if agents else None, "schools": schools if schools else None, @@ -698,7 +736,8 @@ class RealtorScraper(Scraper): return address_part - def _parse_address(self, result: dict, search_type): + @staticmethod + def _parse_address(result: dict, search_type): if search_type == "general_search": address = result["location"]["address"] else: @@ -706,12 +745,12 @@ class RealtorScraper(Scraper): return Address( street=" ".join( - [ - self.handle_none_safely(address.get("street_number")), - self.handle_none_safely(address.get("street_direction")), - self.handle_none_safely(address.get("street_name")), - self.handle_none_safely(address.get("street_suffix")), - ] + part for part in [ + address.get("street_number"), + address.get("street_direction"), + address.get("street_name"), + address.get("street_suffix"), + ] if part is not None ).strip(), unit=address["unit"], city=address["city"], @@ -746,7 +785,7 @@ class RealtorScraper(Scraper): baths_half=description_data.get("baths_half"), sqft=description_data.get("sqft"), lot_sqft=description_data.get("lot_sqft"), - sold_price=description_data.get("sold_price"), + sold_price=description_data.get("sold_price") if result.get('last_sold_date') or result["list_price"] != description_data.get("sold_price") else None, #: has a sold date or list and sold price are different year_built=description_data.get("year_built"), garage=description_data.get("garage"), stories=description_data.get("stories"), diff --git a/homeharvest/utils.py b/homeharvest/utils.py index d164b9a..da57ca9 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,6 +1,6 @@ import pandas as pd from datetime import datetime -from .core.scrapers.models import Property, ListingType +from .core.scrapers.models import Property, ListingType, Agent from .exceptions import InvalidListingType, InvalidDate ordered_properties = [ @@ -38,8 +38,8 @@ ordered_properties = [ "hoa_fee", "parking_garage", "agent", - "broker", - "broker_phone", + "agent_email", + "agent_phones", "nearby_schools", "primary_photo", "alt_photos", @@ -59,12 +59,11 @@ def process_result(result: Property) -> pd.DataFrame: prop_data["zip_code"] = address_data.zip if "agents" in prop_data: - agents = prop_data["agents"] + agents: list[Agent] | None = prop_data["agents"] if agents: prop_data["agent"] = agents[0].name - if len(agents) > 1: - prop_data["broker"] = agents[1].name - prop_data["broker_phone"] = agents[1].phone + prop_data["agent_email"] = agents[0].email + prop_data["agent_phones"] = agents[0].phones prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None @@ -107,5 +106,5 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None: if date_to_obj < date_from_obj: raise InvalidDate("date_to must be after date_from.") - except ValueError as e: + except ValueError: raise InvalidDate(f"Invalid date format or range") diff --git a/pyproject.toml b/pyproject.toml index b9d01d5..2406b30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.3.20" +version = "0.3.21" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest"