From c7a4bfd5e4096f2ebb9b635e95431a73354ce8df Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Mon, 18 Sep 2023 16:18:22 -0500
Subject: [PATCH 1/3] feat: run all 3 sites with one scrape_property() call

---
 homeharvest/__init__.py | 65 +++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 27ea8cb..1b13037 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -1,11 +1,14 @@
+import pandas as pd
+from typing import Union
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+
+from .core.scrapers import ScraperInput
 from .core.scrapers.redfin import RedfinScraper
 from .core.scrapers.realtor import RealtorScraper
 from .core.scrapers.zillow import ZillowScraper
 from .core.scrapers.models import ListingType, Property, SiteName
-from .core.scrapers import ScraperInput
 from .exceptions import InvalidSite, InvalidListingType
-from typing import Union
-import pandas as pd
 
 
 _scrapers = {
@@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
     return properties_df
 
 
-def scrape_property(
-    location: str,
-    site_name: str,
-    listing_type: str = "for_sale", #: for_sale, for_rent, sold
+def _scrape_single_site(
+    location: str, site_name: str, listing_type: str
 ) -> pd.DataFrame:
     """
-    Scrape property from various sites from a given location and listing type.
-
-    :returns: pd.DataFrame
-    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
-    :param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
-    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
-    :return: pd.DataFrame containing properties
+    Helper function to scrape a single site.
     """
-
     validate_input(site_name, listing_type)
 
     scraper_input = ScraperInput(
@@ -122,3 +116,44 @@
         return pd.DataFrame()
 
     return pd.concat(properties_dfs, ignore_index=True)
+
+
+def scrape_property(
+    location: str,
+    site_name: Union[str, list[str]],
+    listing_type: str = "for_sale",
+) -> pd.DataFrame:
+    """
+    Scrape property from various sites from a given location and listing type.
+
+    :returns: pd.DataFrame
+    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
+    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin')
+    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
+    :return: pd.DataFrame containing properties
+    """
+    if site_name is None:
+        site_name = list(_scrapers.keys())
+
+    if not isinstance(site_name, list):
+        site_name = [site_name]
+
+    if len(site_name) == 1:
+        return _scrape_single_site(location, site_name[0], listing_type)
+
+    results = []
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
+            for s_name in site_name
+        }
+
+        for future in concurrent.futures.as_completed(futures):
+            result = future.result()
+            results.append(result)
+
+    if not results:
+        return pd.DataFrame()
+    final_df = pd.concat(results, ignore_index=True)
+    final_df = final_df.drop_duplicates(subset="street_address", keep="first")
+    return final_df
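A quick usage sketch for the API introduced in PATCH 1/3. It only uses names that appear in the diff above (the site and location strings come from the docstring examples); treat it as illustrative, not as part of the series.

    from homeharvest import scrape_property

    # One call now fans out to several scrapers; the results come back as a
    # single concatenated DataFrame, de-duplicated on street_address.
    properties = scrape_property(
        location="85281",
        site_name=["realtor.com", "zillow", "redfin"],
        listing_type="for_sale",
    )
    print(properties.head())
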
From 588689c230e5548b03e447f5912f16d121810a6c Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Mon, 18 Sep 2023 17:04:34 -0500
Subject: [PATCH 2/3] fix: normalize unit num

---
 homeharvest/__init__.py                       |  6 +++--
 homeharvest/core/scrapers/realtor/__init__.py |  2 +-
 homeharvest/core/scrapers/zillow/__init__.py  | 24 +++----------------
 homeharvest/utils.py                          | 20 +++++++++++++---
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 1b13037..c64b609 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -120,7 +120,7 @@ def _scrape_single_site(
 
 def scrape_property(
     location: str,
-    site_name: Union[str, list[str]],
+    site_name: Union[str, list[str]] = list(_scrapers.keys()),
     listing_type: str = "for_sale",
 ) -> pd.DataFrame:
     """
@@ -139,7 +139,9 @@ def scrape_property(
         site_name = [site_name]
 
     if len(site_name) == 1:
-        return _scrape_single_site(location, site_name[0], listing_type)
+        final_df = _scrape_single_site(location, site_name[0], listing_type)
+        final_df = final_df.drop_duplicates(subset="street_address", keep="first")
+        return final_df
 
     results = []
     with ThreadPoolExecutor() as executor:
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index f6490f6..eccfec8 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
                     city=result["location"]["address"]["city"],
                     state=result["location"]["address"]["state_code"],
                     zip_code=result["location"]["address"]["postal_code"],
-                    unit=result["location"]["address"]["unit"],
+                    unit=parse_address_two(result["location"]["address"]["unit"]),
                     country="USA",
                 ),
                 site_name=self.site_name,
diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py
index 0b4d889..96c5fb9 100644
--- a/homeharvest/core/scrapers/zillow/__init__.py
+++ b/homeharvest/core/scrapers/zillow/__init__.py
@@ -130,7 +130,7 @@ class ZillowScraper(Scraper):
                 home_info = result["hdpData"]["homeInfo"]
                 address_data = {
                     "street_address": home_info["streetAddress"],
-                    "unit": home_info.get("unit"),
+                    "unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None,
                     "city": home_info["city"],
                     "state": home_info["state"],
                     "zip_code": home_info["zipcode"],
@@ -213,22 +213,6 @@ class ZillowScraper(Scraper):
 
         return properties_list
 
-    def _extract_units(self, result: dict):
-        units = {}
-        if "units" in result:
-            num_units = result.get("availabilityCount", len(result["units"]))
-            prices = [
-                int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
-                for unit in result["units"]
-            ]
-            units["apt_availability_count"] = num_units
-            units["apt_min_unit_price"] = min(prices)
-            units["apt_max_unit_price"] = max(prices)
-            units["apt_avg_unit_price"] = (
-                sum(prices) // num_units if num_units else None
-            )
-        return units
-
     def _get_single_property_page(self, property_data: dict):
         """
         This method is used when a user enters the exact location & zillow returns just one property
@@ -239,10 +223,9 @@ class ZillowScraper(Scraper):
             else property_data["hdpUrl"]
         )
         address_data = property_data["address"]
-        unit = parse_address_two(address_data["streetAddress"])
         address = Address(
             street_address=address_data["streetAddress"],
-            unit=unit,
+            unit=parse_address_two(address_data["streetAddress"]),
             city=address_data["city"],
             state=address_data["state"],
             zip_code=address_data["zipcode"],
@@ -301,11 +284,10 @@ class ZillowScraper(Scraper):
         else:
             raise ValueError(f"Unexpected state/zip format in address: {address_str}")
 
-        unit = parse_address_two(street_address)
         return Address(
             street_address=street_address,
             city=city,
-            unit=unit,
+            unit=parse_address_two(street_address),
             state=state,
             zip_code=zip_code,
             country="USA",
diff --git a/homeharvest/utils.py b/homeharvest/utils.py
index a22cdcf..db85762 100644
--- a/homeharvest/utils.py
+++ b/homeharvest/utils.py
@@ -1,6 +1,20 @@
 import re
 
 
-def parse_address_two(address_one: str):
-    apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
-    return apt_match.group().strip() if apt_match else None
+def parse_address_two(street_address: str):
+    if not street_address:
+        return None
+    apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I)
+
+    if apt_match:
+        apt_str = apt_match.group().strip()
+        apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
+        return apt_str
+    else:
+        return None
+
+if __name__ == "__main__":
+    print(parse_address_two("810 E Colter St APT 32"))
+    print(parse_address_two("1234 Elm Street apt 2B"))
+    print(parse_address_two("1234 Elm Street UNIT 3A"))
+    print(parse_address_two("1234 Elm Street unit 3A"))
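The heart of PATCH 2/3 is the rewritten parse_address_two() in homeharvest/utils.py: it pulls a trailing APT/UNIT/LOT/# token off a street address and normalizes the prefix to "#". A small sketch of the expected behaviour, with the outcomes inferred from the regexes above rather than taken from project tests:

    from homeharvest.utils import parse_address_two

    # APT/UNIT/LOT prefixes collapse to "#"; addresses without a unit token yield None.
    assert parse_address_two("810 E Colter St APT 32") == "#32"
    assert parse_address_two("1234 Elm Street UNIT 3A") == "#3A"
    assert parse_address_two("1234 Elm Street LOT 12") == "#12"
    assert parse_address_two("55 Main St #4B") == "#4B"
    assert parse_address_two("1234 Elm Street") is None
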
- units["apt_min_unit_price"] = min(prices) - units["apt_max_unit_price"] = max(prices) - units["apt_avg_unit_price"] = ( - sum(prices) // num_units if num_units else None - ) - return units - def _get_single_property_page(self, property_data: dict): """ This method is used when a user enters the exact location & zillow returns just one property @@ -239,10 +223,9 @@ class ZillowScraper(Scraper): else property_data["hdpUrl"] ) address_data = property_data["address"] - unit = parse_address_two(address_data["streetAddress"]) address = Address( street_address=address_data["streetAddress"], - unit=unit, + unit=parse_address_two(address_data["streetAddress"]), city=address_data["city"], state=address_data["state"], zip_code=address_data["zipcode"], @@ -301,11 +284,10 @@ class ZillowScraper(Scraper): else: raise ValueError(f"Unexpected state/zip format in address: {address_str}") - unit = parse_address_two(street_address) return Address( street_address=street_address, city=city, - unit=unit, + unit=parse_address_two(street_address), state=state, zip_code=zip_code, country="USA", diff --git a/homeharvest/utils.py b/homeharvest/utils.py index a22cdcf..db85762 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,6 +1,20 @@ import re -def parse_address_two(address_one: str): - apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) - return apt_match.group().strip() if apt_match else None +def parse_address_two(street_address: str): + if not street_address: + return None + apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I) + + if apt_match: + apt_str = apt_match.group().strip() + apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I) + return apt_str + else: + return None + +if __name__ == "__main__": + print(parse_address_two("810 E Colter St APT 32")) + print(parse_address_two("1234 Elm Street apt 2B")) + print(parse_address_two("1234 Elm Street UNIT 3A")) + print(parse_address_two("1234 Elm Street unit 3A")) From 8e140a0e45fe4de1fb6cc9a4be5e54aff21dea39 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 17:04:54 -0500 Subject: [PATCH 3/3] chore: format --- homeharvest/core/scrapers/zillow/__init__.py | 4 +++- homeharvest/utils.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 96c5fb9..4e55e4b 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -130,7 +130,9 @@ class ZillowScraper(Scraper): home_info = result["hdpData"]["homeInfo"] address_data = { "street_address": home_info["streetAddress"], - "unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None, + "unit": parse_address_two(home_info["unit"]) + if "unit" in home_info + else None, "city": home_info["city"], "state": home_info["state"], "zip_code": home_info["zipcode"], diff --git a/homeharvest/utils.py b/homeharvest/utils.py index db85762..bb88d9b 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -4,7 +4,11 @@ import re def parse_address_two(street_address: str): if not street_address: return None - apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I) + apt_match = re.search( + r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", + street_address, + re.I, + ) if apt_match: apt_str = apt_match.group().strip() @@ -13,6 +17,7 @@ def 
     else:
         return None
 
+
 if __name__ == "__main__":
     print(parse_address_two("810 E Colter St APT 32"))
     print(parse_address_two("1234 Elm Street apt 2B"))
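With the whole series applied, site_name defaults to every registered scraper and both the single-site and multi-site paths drop duplicate rows by street_address. A short end-to-end sketch; the location and listing_type values are just examples taken from the docstring, and no columns other than street_address are assumed:

    from homeharvest import scrape_property

    # Default site_name scrapes realtor.com, zillow, and redfin concurrently,
    # then drops rows that share a street_address.
    listings = scrape_property(location="Cook County, IL", listing_type="sold")
    print(len(listings), "unique listings by street_address")
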