From ca260fd2b41a61033816136100d52c0e4f67d796 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 17:42:16 -0500 Subject: [PATCH] fix: filter dup on street, unit, city --- homeharvest/__init__.py | 8 +++- homeharvest/core/scrapers/realtor/__init__.py | 12 +++--- homeharvest/core/scrapers/redfin/__init__.py | 40 +++++++++++-------- homeharvest/core/scrapers/zillow/__init__.py | 14 ++++--- homeharvest/utils.py | 26 +++++++++++- tests/test_realtor.py | 2 +- tests/test_redfin.py | 2 +- tests/test_zillow.py | 2 +- 8 files changed, 71 insertions(+), 35 deletions(-) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index c64b609..ba671f7 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -140,7 +140,9 @@ def scrape_property( if len(site_name) == 1: final_df = _scrape_single_site(location, site_name[0], listing_type) - final_df = final_df.drop_duplicates(subset="street_address", keep="first") + final_df = final_df.drop_duplicates( + subset=["street_address", "city", "unit"], keep="first" + ) return final_df results = [] @@ -157,5 +159,7 @@ def scrape_property( if not results: return pd.DataFrame() final_df = pd.concat(results, ignore_index=True) - final_df = final_df.drop_duplicates(subset="street_address", keep="first") + final_df = final_df.drop_duplicates( + subset=["street_address", "city", "unit"], keep="first" + ) return final_df diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index eccfec8..5298c3a 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -3,7 +3,7 @@ from ..models import Property, Address from .. import Scraper from typing import Any, Generator from ....exceptions import NoResultsFound -from ....utils import parse_address_two +from ....utils import parse_address_two, parse_unit from concurrent.futures import ThreadPoolExecutor, as_completed @@ -108,8 +108,7 @@ class RealtorScraper(Scraper): response_json = response.json() property_info = response_json["data"]["property"] - street_address = property_info["address"]["line"] - unit = parse_address_two(street_address) + street_address, unit = parse_address_two(property_info["address"]["line"]) return [ Property( @@ -234,13 +233,16 @@ class RealtorScraper(Scraper): return [] for result in response_json["data"]["home_search"]["results"]: + street_address, unit = parse_address_two( + result["location"]["address"]["line"] + ) realty_property = Property( address=Address( - street_address=result["location"]["address"]["line"], + street_address=street_address, city=result["location"]["address"]["city"], state=result["location"]["address"]["state_code"], zip_code=result["location"]["address"]["postal_code"], - unit=parse_address_two(result["location"]["address"]["unit"]), + unit=parse_unit(result["location"]["address"]["unit"]), country="USA", ), site_name=self.site_name, diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index 5350460..392855a 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -1,7 +1,7 @@ import json from typing import Any from .. import Scraper -from ....utils import parse_address_two +from ....utils import parse_address_two, parse_unit from ..models import Property, Address, PropertyType @@ -39,9 +39,10 @@ class RedfinScraper(Scraper): return home[key]["value"] if not single_search: - unit = parse_address_two(get_value("streetLine")) + street_address, unit = parse_address_two(get_value("streetLine")) + unit = parse_unit(get_value("streetLine")) address = Address( - street_address=get_value("streetLine"), + street_address=street_address, city=home["city"], state=home["state"], zip_code=home["zip"], @@ -50,10 +51,11 @@ class RedfinScraper(Scraper): ) else: address_info = home["streetAddress"] + street_address, unit = parse_address_two(address_info["assembledAddress"]) unit = parse_address_two(address_info["assembledAddress"]) address = Address( - street_address=address_info["assembledAddress"], + street_address=street_address, city=home["city"], state=home["state"], zip_code=home["zip"], @@ -94,26 +96,30 @@ class RedfinScraper(Scraper): ) def _parse_building(self, building: dict) -> Property: + street_address = " ".join( + [ + building["address"]["streetNumber"], + building["address"]["directionalPrefix"], + building["address"]["streetName"], + building["address"]["streetType"], + ] + ) + street_address, unit = parse_address_two(street_address) return Property( site_name=self.site_name, property_type=PropertyType("BUILDING"), address=Address( - street_address=" ".join( - [ - building["address"]["streetNumber"], - building["address"]["directionalPrefix"], - building["address"]["streetName"], - building["address"]["streetType"], - ] - ), + street_address=street_address, city=building["address"]["city"], state=building["address"]["stateOrProvinceCode"], zip_code=building["address"]["postalCode"], - unit=" ".join( - [ - building["address"]["unitType"], - building["address"]["unitValue"], - ] + unit=parse_unit( + " ".join( + [ + building["address"]["unitType"], + building["address"]["unitValue"], + ] + ) ), ), property_url="https://www.redfin.com{}".format(building["url"]), diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 4e55e4b..9eaa546 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -1,7 +1,7 @@ import re import json from .. import Scraper -from ....utils import parse_address_two +from ....utils import parse_address_two, parse_unit from ....exceptions import NoResultsFound, PropertyNotFound from ..models import Property, Address, ListingType, PropertyType, SiteName @@ -129,8 +129,8 @@ class ZillowScraper(Scraper): if "hdpData" in result: home_info = result["hdpData"]["homeInfo"] address_data = { - "street_address": home_info["streetAddress"], - "unit": parse_address_two(home_info["unit"]) + "street_address": parse_address_two(home_info["streetAddress"])[0], + "unit": parse_unit(home_info["unit"]) if "unit" in home_info else None, "city": home_info["city"], @@ -225,9 +225,10 @@ class ZillowScraper(Scraper): else property_data["hdpUrl"] ) address_data = property_data["address"] + street_address, unit = parse_address_two(address_data["streetAddress"]) address = Address( - street_address=address_data["streetAddress"], - unit=parse_address_two(address_data["streetAddress"]), + street_address=street_address, + unit=unit, city=address_data["city"], state=address_data["state"], zip_code=address_data["zipcode"], @@ -286,10 +287,11 @@ class ZillowScraper(Scraper): else: raise ValueError(f"Unexpected state/zip format in address: {address_str}") + street_address, unit = parse_address_two(street_address) return Address( street_address=street_address, city=city, - unit=parse_address_two(street_address), + unit=unit, state=state, zip_code=zip_code, country="USA", diff --git a/homeharvest/utils.py b/homeharvest/utils.py index bb88d9b..dea27d5 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,7 +1,29 @@ import re -def parse_address_two(street_address: str): +def parse_address_two(street_address: str) -> tuple: + if not street_address: + return street_address, None + + apt_match = re.search( + r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", + street_address, + re.I, + ) + + if apt_match: + apt_str = apt_match.group().strip() + cleaned_apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I) + + main_address = street_address.replace( + apt_str, "" + ).strip() # Remove the matched part from the original address + return main_address, cleaned_apt_str + else: + return street_address, None + + +def parse_unit(street_address: str): if not street_address: return None apt_match = re.search( @@ -19,7 +41,7 @@ def parse_address_two(street_address: str): if __name__ == "__main__": - print(parse_address_two("810 E Colter St APT 32")) + print(parse_address_two("4303 E Cactus Rd Apt 126")) print(parse_address_two("1234 Elm Street apt 2B")) print(parse_address_two("1234 Elm Street UNIT 3A")) print(parse_address_two("1234 Elm Street unit 3A")) diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 24d5281..b9525c9 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -9,7 +9,7 @@ def test_realtor(): listing_type="for_sale", ), scrape_property( - location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent" + location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent" ), #: does not support "city, state, USA" format scrape_property( location="Dallas, TX", site_name="realtor.com", listing_type="sold" diff --git a/tests/test_redfin.py b/tests/test_redfin.py index 575d1b4..bc7912a 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -7,7 +7,7 @@ def test_redfin(): location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" ), scrape_property( - location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent" + location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent" ), scrape_property( location="Dallas, TX, USA", site_name="redfin", listing_type="sold" diff --git a/tests/test_zillow.py b/tests/test_zillow.py index 38c3114..b99b6d6 100644 --- a/tests/test_zillow.py +++ b/tests/test_zillow.py @@ -7,7 +7,7 @@ def test_zillow(): location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" ), scrape_property( - location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent" + location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent" ), scrape_property( location="Dallas, TX, USA", site_name="zillow", listing_type="sold"