diff --git a/HomeHarvest_Demo.ipynb b/HomeHarvest_Demo.ipynb index fc0dceb..a9e8f12 100644 --- a/HomeHarvest_Demo.ipynb +++ b/HomeHarvest_Demo.ipynb @@ -31,11 +31,33 @@ "metadata": {}, "outputs": [], "source": [ + "# scrapes all 3 sites by default\n", "scrape_property(\n", - " location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n", + " location=\"dallas\",\n", + " listing_type=\"for_sale\"\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaf86093", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# search a specific address\n", + "scrape_property(\n", + " location=\"2530 Al Lipscomb Way\",\n", + " site_name=\"zillow\",\n", + " listing_type=\"for_sale\"\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -43,8 +65,31 @@ "metadata": {}, "outputs": [], "source": [ + "# check rentals\n", "scrape_property(\n", - " location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n", + " location=\"chicago\",\n", + " site_name=[\"redfin\", \"realtor.com\"],\n", + " listing_type=\"for_rent\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af280cd3", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# check sold properties\n", + "scrape_property(\n", + " location=\"chicago, illinois\",\n", + " site_name=[\"redfin\"],\n", + " listing_type=\"sold\"\n", ")" ] } diff --git a/README.md b/README.md index 42764ff..757f074 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -# HomeHarvest + -**HomeHarvest** is a simple but comprehensive real estate scraping library. +**HomeHarvest** is a simple, yet comprehensive, real estate scraping library. 
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) - *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* ## Features + - Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously - Aggregates the properties in a Pandas DataFrame @@ -32,8 +32,6 @@ properties: pd.DataFrame = scrape_property( #: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel(). print(properties) - - ``` ## Output ```py diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index ca6d7eb..e2f7f2a 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -17,7 +17,6 @@ _scrapers = { "zillow": ZillowScraper, } - def validate_input(site_name: str, listing_type: str) -> None: if site_name.lower() not in _scrapers: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") @@ -27,7 +26,6 @@ def validate_input(site_name: str, listing_type: str) -> None: f"Provided listing type, '{listing_type}', does not exist." 
) - def get_ordered_properties(result: Property) -> list[str]: return [ "property_url", @@ -67,7 +65,6 @@ def get_ordered_properties(result: Property) -> list[str]: "longitude", ] - def process_result(result: Property) -> pd.DataFrame: prop_data = result.__dict__ @@ -93,7 +90,6 @@ def process_result(result: Property) -> pd.DataFrame: return properties_df - def _scrape_single_site( location: str, site_name: str, listing_type: str ) -> pd.DataFrame: @@ -112,6 +108,7 @@ def _scrape_single_site( results = site.search() properties_dfs = [process_result(result) for result in results] + properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty] if not properties_dfs: return pd.DataFrame() @@ -154,6 +151,8 @@ def scrape_property( result = future.result() results.append(result) + results = [df for df in results if not df.empty and not df.isna().all().all()] + if not results: return pd.DataFrame() diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 0ee873e..f6cd68d 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -249,8 +249,8 @@ class RealtorScraper(Scraper): unit=parse_unit(result["location"]["address"]["unit"]), country="USA", ), - latitude=result["location"]["address"]["coordinate"]["lat"], - longitude=result["location"]["address"]["coordinate"]["lon"], + latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None, + longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None, site_name=self.site_name, 
property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"], diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index a3e9e18..d701ff2 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -94,8 +94,8 @@ class RedfinScraper(Scraper): price_per_sqft=get_value("pricePerSqFt"), price=get_value("price"), mls_id=get_value("mlsId"), - latitude=home["latLong"]["latitude"] if "latLong" in home else None, - longitude=home["latLong"]["longitude"] if "latLong" in home else None, + latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None, + longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None ) def _parse_building(self, building: dict) -> Property: