From 905cfcae2c50cbbd98ce1fa22816acdf7e97bdf5 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Sun, 17 Sep 2023 18:52:34 -0500
Subject: [PATCH] refactor: scrape_property()

---
 homeharvest/__init__.py                      | 134 ++++++++++---------
 homeharvest/core/scrapers/models.py          |   4 +-
 homeharvest/core/scrapers/redfin/__init__.py |  11 +-
 homeharvest/core/scrapers/zillow/__init__.py |   5 +-
 4 files changed, 87 insertions(+), 67 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 52dfb9b..f817806 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -15,11 +15,7 @@ _scrapers = {
 }
 
 
-def scrape_property(
-    location: str,
-    site_name: str,
-    listing_type: str = "for_sale", #: for_sale, for_rent, sold
-) -> Union[list[Building], list[Property]]:
+def validate_input(site_name: str, listing_type: str) -> None:
     if site_name.lower() not in _scrapers:
         raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
 
@@ -28,6 +24,75 @@ def scrape_property(
             f"Provided listing type, '{listing_type}', does not exist."
         )
 
+
+def get_ordered_properties(result: Union[Building, Property]) -> list[str]:
+    if isinstance(result, Property):
+        return [
+            "listing_type",
+            "address_one",
+            "city",
+            "state",
+            "zip_code",
+            "address_two",
+            "url",
+            "property_type",
+            "price",
+            "beds",
+            "baths",
+            "square_feet",
+            "price_per_square_foot",
+            "lot_size",
+            "stories",
+            "year_built",
+            "agent_name",
+            "mls_id",
+            "description",
+        ]
+    elif isinstance(result, Building):
+        return [
+            "address_one",
+            "city",
+            "state",
+            "zip_code",
+            "address_two",
+            "url",
+            "num_units",
+            "min_unit_price",
+            "max_unit_price",
+            "avg_unit_price",
+            "listing_type",
+        ]
+    return []
+
+
+def process_result(result: Union[Building, Property]) -> pd.DataFrame:
+    prop_data = result.__dict__
+
+    address_data = prop_data["address"]
+    prop_data["site_name"] = prop_data["site_name"].value
+    prop_data["listing_type"] = prop_data["listing_type"].value
+    prop_data["property_type"] = prop_data["property_type"].value.lower()
+    prop_data["address_one"] = address_data.address_one
+    prop_data["city"] = address_data.city
+    prop_data["state"] = address_data.state
+    prop_data["zip_code"] = address_data.zip_code
+    prop_data["address_two"] = address_data.address_two
+
+    del prop_data["address"]
+
+    properties_df = pd.DataFrame([prop_data])
+    properties_df = properties_df[get_ordered_properties(result)]
+
+    return properties_df
+
+
+def scrape_property(
+    location: str,
+    site_name: str,
+    listing_type: str = "for_sale", #: for_sale, for_rent, sold
+) -> Union[list[Building], list[Property]]:
+    validate_input(site_name, listing_type)
+
     scraper_input = ScraperInput(
         location=location,
         listing_type=ListingType[listing_type.upper()],
@@ -37,63 +102,6 @@ def scrape_property(
     site = _scrapers[site_name.lower()](scraper_input)
     results = site.search()
 
-    properties_dfs = []
-
-    for result in results:
-        prop_data = result.__dict__
-
-        address_data = prop_data["address"]
-        prop_data["site_name"] = prop_data["site_name"].value
-        prop_data["listing_type"] = prop_data["listing_type"].value
-        prop_data["property_type"] = prop_data["property_type"].value.lower()
-        prop_data["address_one"] = address_data.address_one
-        prop_data["city"] = address_data.city
-        prop_data["state"] = address_data.state
-        prop_data["zip_code"] = address_data.zip_code
-        prop_data["address_two"] = address_data.address_two
-
-        del prop_data["address"]
-
-        if isinstance(result, Property):
-            desired_order = [
-                "listing_type",
-                "address_one",
-                "city",
-                "state",
-                "zip_code",
-                "address_two",
-                "url",
-                "property_type",
-                "price",
-                "beds",
-                "baths",
-                "square_feet",
-                "price_per_square_foot",
-                "lot_size",
-                "stories",
-                "year_built",
-                "agent_name",
-                "mls_id",
-                "description",
-            ]
-
-        elif isinstance(result, Building):
-            desired_order = [
-                "address_one",
-                "city",
-                "state",
-                "zip_code",
-                "address_two",
-                "url",
-                "num_units",
-                "min_unit_price",
-                "max_unit_price",
-                "avg_unit_price",
-                "listing_type",
-            ]
-
-        properties_df = pd.DataFrame([prop_data])
-        properties_df = properties_df[desired_order]
-        properties_dfs.append(properties_df)
+    properties_dfs = [process_result(result) for result in results]
 
     return pd.concat(properties_dfs, ignore_index=True)
diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py
index 9da1d30..1a3db97 100644
--- a/homeharvest/core/scrapers/models.py
+++ b/homeharvest/core/scrapers/models.py
@@ -17,9 +17,11 @@ class ListingType(Enum):
 class PropertyType(Enum):
     HOUSE = "HOUSE"
     CONDO = "CONDO"
-    TOWNHOUSE = "townhousE"
+    TOWNHOUSE = "TOWNHOUSE"
     SINGLE_FAMILY = "SINGLE_FAMILY"
     MULTI_FAMILY = "MULTI_FAMILY"
+    MANUFACTURED = "MANUFACTURED"
+    APARTMENT = "APARTMENT"
     LAND = "LAND"
     OTHER = "OTHER"
 
diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py
index 3f08f20..29855a7 100644
--- a/homeharvest/core/scrapers/redfin/__init__.py
+++ b/homeharvest/core/scrapers/redfin/__init__.py
@@ -55,6 +55,15 @@ class RedfinScraper(Scraper):
         )
         url = "https://www.redfin.com{}".format(home["url"])
         property_type = home["propertyType"] if "propertyType" in home else None
+        lot_size_data = home.get("lotSize")
+        if not isinstance(lot_size_data, int):
+            lot_size = (
+                lot_size_data.get("value", None)
+                if isinstance(lot_size_data, dict)
+                else None
+            )
+        else:
+            lot_size = lot_size_data
 
         return Property(
             site_name=self.site_name,
@@ -70,7 +79,7 @@ class RedfinScraper(Scraper):
             if not single_search
             else home["yearBuilt"],
             square_feet=get_value("sqFt"),
-            lot_size=home.get("lotSize", {}).get("value", None),
+            lot_size=lot_size,
             property_type=PropertyType.from_int_code(home.get("propertyType")),
             price_per_square_foot=get_value("pricePerSqFt"),
             price=get_value("price"),
diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py
index cc41826..1b25c16 100644
--- a/homeharvest/core/scrapers/zillow/__init__.py
+++ b/homeharvest/core/scrapers/zillow/__init__.py
@@ -97,7 +97,9 @@ class ZillowScraper(Scraper):
             else property_data["hdpUrl"]
         )
         address_data = property_data["address"]
-        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
+        address_one, address_two = self._parse_address_two(
+            address_data["streetAddress"]
+        )
         address = Address(
             address_one=address_one,
             address_two=address_two,
@@ -106,7 +108,6 @@ class ZillowScraper(Scraper):
             zip_code=address_data["zipcode"],
        )
         property_type = property_data.get("homeType", None)
-        print(property_type)
 
         return Property(
             site_name=self.site_name,