Merge pull request #2 from ZacharyHampton/all_3_sites

feat: run all 3 sites with one call
2023-09-18 15:17:50 -07:00 · 2023-09-18 15:17:50 -07:00 · d0a6a66b6a
parent fe351ab57c 8e140a0e45
commit d0a6a66b6a
4 changed files with 80 additions and 40 deletions
--- a/homeharvest/init.py
+++ b/homeharvest/init.py
@ -1,11 +1,14 @@
+import pandas as pd
+from typing import Union
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+
+from .core.scrapers import ScraperInput
 from .core.scrapers.redfin import RedfinScraper
 from .core.scrapers.realtor import RealtorScraper
 from .core.scrapers.zillow import ZillowScraper
 from .core.scrapers.models import ListingType, Property, SiteName
-from .core.scrapers import ScraperInput
 from .exceptions import InvalidSite, InvalidListingType
-from typing import Union
-import pandas as pd


 _scrapers = {
@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
    return properties_df


-def scrape_property(
-    location: str,
-    site_name: str,
-    listing_type: str = "for_sale",  #: for_sale, for_rent, sold
+def _scrape_single_site(
+    location: str, site_name: str, listing_type: str
 ) -> pd.DataFrame:
    """
-    Scrape property from various sites from a given location and listing type.
-
-    :returns: pd.DataFrame
-    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
-    :param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
-    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
-    :return: pd.DataFrame containing properties
+    Helper function to scrape a single site.
    """
-
    validate_input(site_name, listing_type)

    scraper_input = ScraperInput(
@ -122,3 +116,46 @@ def scrape_property(
        return pd.DataFrame()

    return pd.concat(properties_dfs, ignore_index=True)
+
+
+def scrape_property(
+    location: str,
+    site_name: Union[str, list[str], None] = None,
+    listing_type: str = "for_sale",
+) -> pd.DataFrame:
+    """
+    Scrape property from various sites from a given location and listing type.
+
+    When several sites are requested they are scraped on a thread pool. The
+    per-site frames are concatenated in the order the sites were requested and
+    rows repeating an earlier "street_address" are dropped, so for a listing
+    found on several sites the row from the earliest-requested site is kept.
+
+    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
+    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin').
+        Omit it (or pass None) to scrape every site registered in ``_scrapers``.
+    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
+    :return: pd.DataFrame containing properties (an empty frame when the site list is empty)
+    """
+    # Resolve the default at call time instead of freezing a list into the
+    # signature when the module is imported.
+    if site_name is None:
+        site_name = list(_scrapers.keys())
+
+    if not isinstance(site_name, list):
+        site_name = [site_name]
+
+    if not site_name:
+        return pd.DataFrame()
+
+    if len(site_name) == 1:
+        # A single site does not need a thread pool.
+        final_df = _scrape_single_site(location, site_name[0], listing_type)
+    else:
+        with ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(_scrape_single_site, location, s_name, listing_type)
+                for s_name in site_name
+            ]
+            # Collect in submission order rather than completion order so the
+            # row order, and which duplicate survives below, does not depend
+            # on which site happens to answer first.
+            results = [future.result() for future in futures]
+        final_df = pd.concat(results, ignore_index=True)
+
+    return final_df.drop_duplicates(subset="street_address", keep="first")
--- a/homeharvest/core/scrapers/realtor/init.py
+++ b/homeharvest/core/scrapers/realtor/init.py
@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
                    city=result["location"]["address"]["city"],
                    state=result["location"]["address"]["state_code"],
                    zip_code=result["location"]["address"]["postal_code"],
-                    unit=result["location"]["address"]["unit"],
+                    unit=parse_address_two(result["location"]["address"]["unit"]),
                    country="USA",
                ),
                site_name=self.site_name,
--- a/homeharvest/core/scrapers/zillow/init.py
+++ b/homeharvest/core/scrapers/zillow/init.py
@ -130,7 +130,9 @@ class ZillowScraper(Scraper):
                home_info = result["hdpData"]["homeInfo"]
                address_data = {
                    "street_address": home_info["streetAddress"],
-                    "unit": home_info.get("unit"),
+                    "unit": parse_address_two(home_info["unit"])
+                    if "unit" in home_info
+                    else None,
                    "city": home_info["city"],
                    "state": home_info["state"],
                    "zip_code": home_info["zipcode"],
@ -213,22 +215,6 @@ class ZillowScraper(Scraper):

        return properties_list

-    def _extract_units(self, result: dict):
-        units = {}
-        if "units" in result:
-            num_units = result.get("availabilityCount", len(result["units"]))
-            prices = [
-                int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
-                for unit in result["units"]
-            ]
-            units["apt_availability_count"] = num_units
-            units["apt_min_unit_price"] = min(prices)
-            units["apt_max_unit_price"] = max(prices)
-            units["apt_avg_unit_price"] = (
-                sum(prices) // num_units if num_units else None
-            )
-        return units
-
    def _get_single_property_page(self, property_data: dict):
        """
        This method is used when a user enters the exact location & zillow returns just one property
@ -239,10 +225,9 @@ class ZillowScraper(Scraper):
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
-        unit = parse_address_two(address_data["streetAddress"])
        address = Address(
            street_address=address_data["streetAddress"],
-            unit=unit,
+            unit=parse_address_two(address_data["streetAddress"]),
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
@ -301,11 +286,10 @@ class ZillowScraper(Scraper):
        else:
            raise ValueError(f"Unexpected state/zip format in address: {address_str}")

-        unit = parse_address_two(street_address)
        return Address(
            street_address=street_address,
            city=city,
-            unit=unit,
+            unit=parse_address_two(street_address),
            state=state,
            zip_code=zip_code,
            country="USA",
--- a/homeharvest/utils.py
+++ b/homeharvest/utils.py
@ -1,6 +1,25 @@
 import re


-def parse_address_two(address_one: str):
-    apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
-    return apt_match.group().strip() if apt_match else None
+def parse_address_two(street_address: str):
+    """
+    Extract a trailing unit designator from an address, normalised to "#<id>".
+
+    A designator is "APT", "UNIT" or "LOT" (case-insensitive) or "#", followed
+    by letters/digits that run to the very end of the string. After a keyword
+    the identifier must be separated by whitespace or start with a digit.
+
+    :param street_address: address text, e.g. "810 E Colter St APT 32"; may be None or empty
+    :return: "#" plus the identifier as written (e.g. "#32", "#2B"), or None when
+        the input is empty or no designator is found
+    """
+    if not street_address:
+        return None
+
+    # \b keeps a keyword from matching inside a longer word ("Baptist",
+    # "Charlotte"), and the whitespace-or-digit rule keeps words that merely
+    # start with a keyword ("Lotus") from being read as a unit. "#" is a
+    # separate alternative because \b cannot match between a space and "#".
+    unit_match = re.search(
+        r"(?:\b(?:APT|UNIT|LOT)(?:\s+|(?=\d))|#)([\dA-Z]+)$",
+        street_address,
+        re.I,
+    )
+    if unit_match is None:
+        return None
+
+    # Build the result from the captured identifier only, so the identifier
+    # is returned exactly as written.
+    return "#" + unit_match.group(1)
+
+
+if __name__ == "__main__":
+    # Manual smoke check: print the parsed unit for a few sample addresses.
+    for sample_address in (
+        "810 E Colter St APT 32",
+        "1234 Elm Street apt 2B",
+        "1234 Elm Street UNIT 3A",
+        "1234 Elm Street unit 3A",
+    ):
+        print(parse_address_two(sample_address))