From c7a4bfd5e4096f2ebb9b635e95431a73354ce8df Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Mon, 18 Sep 2023 16:18:22 -0500
Subject: [PATCH 1/3] feat: run all 3 sites with one scrape_property() call

---
 homeharvest/__init__.py | 65 +++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 27ea8cb..1b13037 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -1,11 +1,14 @@
+import pandas as pd
+from typing import Union
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+
+from .core.scrapers import ScraperInput
 from .core.scrapers.redfin import RedfinScraper
 from .core.scrapers.realtor import RealtorScraper
 from .core.scrapers.zillow import ZillowScraper
 from .core.scrapers.models import ListingType, Property, SiteName
-from .core.scrapers import ScraperInput
 from .exceptions import InvalidSite, InvalidListingType
-from typing import Union
-import pandas as pd
 
 
 _scrapers = {
@@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
     return properties_df
 
 
-def scrape_property(
-    location: str,
-    site_name: str,
-    listing_type: str = "for_sale", #: for_sale, for_rent, sold
+def _scrape_single_site(
+    location: str, site_name: str, listing_type: str
 ) -> pd.DataFrame:
     """
-    Scrape property from various sites from a given location and listing type.
-
-    :returns: pd.DataFrame
-    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
-    :param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
-    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
-    :return: pd.DataFrame containing properties
+    Helper function to scrape a single site.
     """
-
     validate_input(site_name, listing_type)
 
     scraper_input = ScraperInput(
@@ -122,3 +116,44 @@
         return pd.DataFrame()
 
     return pd.concat(properties_dfs, ignore_index=True)
+
+
+def scrape_property(
+    location: str,
+    site_name: Union[str, list[str]],
+    listing_type: str = "for_sale",
+) -> pd.DataFrame:
+    """
+    Scrape property from various sites from a given location and listing type.
+
+    :returns: pd.DataFrame
+    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
+    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin')
+    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
+    :return: pd.DataFrame containing properties
+    """
+    if site_name is None:
+        site_name = list(_scrapers.keys())
+
+    if not isinstance(site_name, list):
+        site_name = [site_name]
+
+    if len(site_name) == 1:
+        return _scrape_single_site(location, site_name[0], listing_type)
+
+    results = []
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
+            for s_name in site_name
+        }
+
+        for future in concurrent.futures.as_completed(futures):
+            result = future.result()
+            results.append(result)
+
+    if not results:
+        return pd.DataFrame()
+    final_df = pd.concat(results, ignore_index=True)
+    final_df = final_df.drop_duplicates(subset="street_address", keep="first")
+    return final_df
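A quick usage sketch for the API introduced in PATCH 1/3. It only uses names that appear in the diff above (the site and location strings come from the docstring examples); treat it as illustrative, not as part of the series.

    from homeharvest import scrape_property

    # One call now fans out to several scrapers; the results come back as a
    # single concatenated DataFrame, de-duplicated on street_address.
    properties = scrape_property(
        location="85281",
        site_name=["realtor.com", "zillow", "redfin"],
        listing_type="for_sale",
    )
    print(properties.head())
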
From 588689c230e5548b03e447f5912f16d121810a6c Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Mon, 18 Sep 2023 17:04:34 -0500
Subject: [PATCH 2/3] fix: normalize unit num

---
 homeharvest/__init__.py                       |  6 +++--
 homeharvest/core/scrapers/realtor/__init__.py |  2 +-
 homeharvest/core/scrapers/zillow/__init__.py  | 24 +++----------------
 homeharvest/utils.py                          | 20 +++++++++++++---
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 1b13037..c64b609 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -120,7 +120,7 @@ def _scrape_single_site(
 
 def scrape_property(
     location: str,
-    site_name: Union[str, list[str]],
+    site_name: Union[str, list[str]] = list(_scrapers.keys()),
     listing_type: str = "for_sale",
 ) -> pd.DataFrame:
     """
@@ -139,7 +139,9 @@ def scrape_property(
         site_name = [site_name]
 
     if len(site_name) == 1:
-        return _scrape_single_site(location, site_name[0], listing_type)
+        final_df = _scrape_single_site(location, site_name[0], listing_type)
+        final_df = final_df.drop_duplicates(subset="street_address", keep="first")
+        return final_df
 
     results = []
     with ThreadPoolExecutor() as executor:
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index f6490f6..eccfec8 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
                     city=result["location"]["address"]["city"],
                     state=result["location"]["address"]["state_code"],
                     zip_code=result["location"]["address"]["postal_code"],
-                    unit=result["location"]["address"]["unit"],
+                    unit=parse_address_two(result["location"]["address"]["unit"]),
                     country="USA",
                 ),
                 site_name=self.site_name,
diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py
index 0b4d889..96c5fb9 100644
--- a/homeharvest/core/scrapers/zillow/__init__.py
+++ b/homeharvest/core/scrapers/zillow/__init__.py
@@ -130,7 +130,7 @@ class ZillowScraper(Scraper):
                 home_info = result["hdpData"]["homeInfo"]
                 address_data = {
                     "street_address": home_info["streetAddress"],
-                    "unit": home_info.get("unit"),
+                    "unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None,
                     "city": home_info["city"],
                     "state": home_info["state"],
                     "zip_code": home_info["zipcode"],
@@ -213,22 +213,6 @@ class ZillowScraper(Scraper):
 
         return properties_list
 
-    def _extract_units(self, result: dict):
-        units = {}
-        if "units" in result:
-            num_units = result.get("availabilityCount", len(result["units"]))
-            prices = [
-                int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
-                for unit in result["units"]
-            ]
-            units["apt_availability_count"] = num_units
-            units["apt_min_unit_price"] = min(prices)
-            units["apt_max_unit_price"] = max(prices)
-            units["apt_avg_unit_price"] = (
-                sum(prices) // num_units if num_units else None
-            )
-        return units
-
     def _get_single_property_page(self, property_data: dict):
         """
         This method is used when a user enters the exact location & zillow returns just one property
@@ -239,10 +223,9 @@ class ZillowScraper(Scraper):
             else property_data["hdpUrl"]
         )
         address_data = property_data["address"]
-        unit = parse_address_two(address_data["streetAddress"])
         address = Address(
             street_address=address_data["streetAddress"],
-            unit=unit,
+            unit=parse_address_two(address_data["streetAddress"]),
             city=address_data["city"],
             state=address_data["state"],
             zip_code=address_data["zipcode"],
@@ -301,11 +284,10 @@ class ZillowScraper(Scraper):
         else:
             raise ValueError(f"Unexpected state/zip format in address: {address_str}")
 
-        unit = parse_address_two(street_address)
         return Address(
             street_address=street_address,
             city=city,
-            unit=unit,
+            unit=parse_address_two(street_address),
             state=state,
             zip_code=zip_code,
             country="USA",
diff --git a/homeharvest/utils.py b/homeharvest/utils.py
index a22cdcf..db85762 100644
--- a/homeharvest/utils.py
+++ b/homeharvest/utils.py
@@ -1,6 +1,20 @@
 import re
 
 
-def parse_address_two(address_one: str):
-    apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
-    return apt_match.group().strip() if apt_match else None
+def parse_address_two(street_address: str):
+    if not street_address:
+        return None
+    apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I)
+
+    if apt_match:
+        apt_str = apt_match.group().strip()
+        apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
+        return apt_str
+    else:
+        return None
+
+if __name__ == "__main__":
+    print(parse_address_two("810 E Colter St APT 32"))
+    print(parse_address_two("1234 Elm Street apt 2B"))
+    print(parse_address_two("1234 Elm Street UNIT 3A"))
+    print(parse_address_two("1234 Elm Street unit 3A"))
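The heart of PATCH 2/3 is the rewritten parse_address_two() in homeharvest/utils.py: it pulls a trailing APT/UNIT/LOT/# token off a street address and normalizes the prefix to "#". A small sketch of the expected behaviour, with the outcomes inferred from the regexes above rather than taken from project tests:

    from homeharvest.utils import parse_address_two

    # APT/UNIT/LOT prefixes collapse to "#"; addresses without a unit token yield None.
    assert parse_address_two("810 E Colter St APT 32") == "#32"
    assert parse_address_two("1234 Elm Street UNIT 3A") == "#3A"
    assert parse_address_two("1234 Elm Street LOT 12") == "#12"
    assert parse_address_two("55 Main St #4B") == "#4B"
    assert parse_address_two("1234 Elm Street") is None
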
- units["apt_min_unit_price"] = min(prices) - units["apt_max_unit_price"] = max(prices) - units["apt_avg_unit_price"] = ( - sum(prices) // num_units if num_units else None - ) - return units - def _get_single_property_page(self, property_data: dict): """ This method is used when a user enters the exact location & zillow returns just one property @@ -239,10 +223,9 @@ class ZillowScraper(Scraper): else property_data["hdpUrl"] ) address_data = property_data["address"] - unit = parse_address_two(address_data["streetAddress"]) address = Address( street_address=address_data["streetAddress"], - unit=unit, + unit=parse_address_two(address_data["streetAddress"]), city=address_data["city"], state=address_data["state"], zip_code=address_data["zipcode"], @@ -301,11 +284,10 @@ class ZillowScraper(Scraper): else: raise ValueError(f"Unexpected state/zip format in address: {address_str}") - unit = parse_address_two(street_address) return Address( street_address=street_address, city=city, - unit=unit, + unit=parse_address_two(street_address), state=state, zip_code=zip_code, country="USA", diff --git a/homeharvest/utils.py b/homeharvest/utils.py index a22cdcf..db85762 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,6 +1,20 @@ import re -def parse_address_two(address_one: str): - apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) - return apt_match.group().strip() if apt_match else None +def parse_address_two(street_address: str): + if not street_address: + return None + apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I) + + if apt_match: + apt_str = apt_match.group().strip() + apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I) + return apt_str + else: + return None + +if __name__ == "__main__": + print(parse_address_two("810 E Colter St APT 32")) + print(parse_address_two("1234 Elm Street apt 2B")) + print(parse_address_two("1234 Elm Street UNIT 3A")) + print(parse_address_two("1234 Elm Street unit 3A")) From 8e140a0e45fe4de1fb6cc9a4be5e54aff21dea39 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 17:04:54 -0500 Subject: [PATCH 3/3] chore: format --- homeharvest/core/scrapers/zillow/__init__.py | 4 +++- homeharvest/utils.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 96c5fb9..4e55e4b 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -130,7 +130,9 @@ class ZillowScraper(Scraper): home_info = result["hdpData"]["homeInfo"] address_data = { "street_address": home_info["streetAddress"], - "unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None, + "unit": parse_address_two(home_info["unit"]) + if "unit" in home_info + else None, "city": home_info["city"], "state": home_info["state"], "zip_code": home_info["zipcode"], diff --git a/homeharvest/utils.py b/homeharvest/utils.py index db85762..bb88d9b 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -4,7 +4,11 @@ import re def parse_address_two(street_address: str): if not street_address: return None - apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I) + apt_match = re.search( + r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", + street_address, + re.I, + ) if apt_match: apt_str = apt_match.group().strip() @@ -13,6 +17,7 @@ def 
     else:
         return None
 
+
 if __name__ == "__main__":
     print(parse_address_two("810 E Colter St APT 32"))
     print(parse_address_two("1234 Elm Street apt 2B"))
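With the whole series applied, site_name defaults to every registered scraper and both the single-site and multi-site paths drop duplicate rows by street_address. A short end-to-end sketch; the location and listing_type values are just examples taken from the docstring, and no columns other than street_address are assumed:

    from homeharvest import scrape_property

    # Default site_name scrapes realtor.com, zillow, and redfin concurrently,
    # then drops rows that share a street_address.
    listings = scrape_property(location="Cook County, IL", listing_type="sold")
    print(len(listings), "unique listings by street_address")
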