From cc76e067b23873725baee47acb3753db1b79d021 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 20:01:55 -0500 Subject: [PATCH 1/3] fix: lat/long KeyError --- README.md | 2 +- homeharvest/core/scrapers/realtor/__init__.py | 4 ++-- homeharvest/core/scrapers/redfin/__init__.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1790f38..5395dbd 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ properties.to_csv('props.csv', index=False) ## Output ```py >>> properties.head() - street city ... mls_id description + street city ... mls_id description 0 420 N Scottsdale Rd Tempe ... NaN NaN 1 1255 E University Dr Tempe ... NaN NaN 2 1979 E Rio Salado Pkwy Tempe ... NaN NaN diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 0ee873e..f6cd68d 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -249,8 +249,8 @@ class RealtorScraper(Scraper): unit=parse_unit(result["location"]["address"]["unit"]), country="USA", ), - latitude=result["location"]["address"]["coordinate"]["lat"], - longitude=result["location"]["address"]["coordinate"]["lon"], + latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None, + longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None, site_name=self.site_name, property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"], diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index a3e9e18..d701ff2 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -94,8 +94,8 @@ class RedfinScraper(Scraper): price_per_sqft=get_value("pricePerSqFt"), price=get_value("price"), mls_id=get_value("mlsId"), - latitude=home["latLong"]["latitude"] if "latLong" in home else None, - longitude=home["latLong"]["longitude"] if "latLong" in home else None, + latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None, + longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None ) def _parse_building(self, building: dict) -> Property: From b01162161d6664b5a20c8737f30ee2d40c128208 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 20:09:28 -0500 Subject: [PATCH 2/3] chore: merge --- HomeHarvest_Demo.ipynb | 49 ++++++++++++++++++++++++++++++++++++++++-- README.md | 6 +++--- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/HomeHarvest_Demo.ipynb b/HomeHarvest_Demo.ipynb index fc0dceb..a9e8f12 100644 --- a/HomeHarvest_Demo.ipynb +++ b/HomeHarvest_Demo.ipynb @@ -31,11 +31,33 @@ "metadata": {}, "outputs": [], "source": [ + "# scrapes all 3 sites by default\n", "scrape_property(\n", - " location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n", + " location=\"dallas\",\n", + " listing_type=\"for_sale\"\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaf86093", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# search a specific address\n", + "scrape_property(\n", + " location=\"2530 Al Lipscomb Way\",\n", + " site_name=\"zillow\",\n", + " listing_type=\"for_sale\"\n", + ")," + ] + }, { "cell_type": "code", "execution_count": null, @@ -43,8 +65,31 @@ "metadata": {}, "outputs": [], "source": [ + "# check rentals\n", "scrape_property(\n", - " location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n", + " location=\"chicago\",\n", + " site_name=[\"redfin\", \"realtor.com\"],\n", + " listing_type=\"for_rent\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af280cd3", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# check sold properties\n", + "scrape_property(\n", + " location=\"chicago, illinois\",\n", + " site_name=[\"redfin\"],\n", + " listing_type=\"sold\"\n", ")" ] } diff --git a/README.md b/README.md index d7a9285..7d1e66f 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ print(properties) ``` ## Output ```py ->>> properties.head() - street city ... mls_id description +>> properties.head() + street city ... mls_id description 0 420 N Scottsdale Rd Tempe ... NaN NaN 1 1255 E University Dr Tempe ... NaN NaN 2 1979 E Rio Salado Pkwy Tempe ... NaN NaN @@ -118,7 +118,7 @@ The following exceptions may be raised when using HomeHarvest: - `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `NoResultsFound` - no properties found from your input -- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input +- `GeoCoordsNotFound` - if Zillow scraper is not able to find the geo-coordinates from the `location` ## Frequently Asked Questions From 2eec389838d3179dd863d90697e1165a69bf1346 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 21:02:12 -0500 Subject: [PATCH 3/3] docs: add logo --- README.md | 14 ++++++-------- homeharvest/__init__.py | 7 +++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 7d1e66f..757f074 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -# HomeHarvest + -**HomeHarvest** is a simple but comprehensive real estate scraping library. +**HomeHarvest** is a simple, yet comprehensive, real estate scraping library. [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) - *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* ## Features + - Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously - Aggregates the properties in a Pandas DataFrame @@ -32,13 +32,11 @@ properties: pd.DataFrame = scrape_property( #: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel(). print(properties) - - ``` ## Output ```py ->> properties.head() - street city ... mls_id description +>>> properties.head() + street city ... mls_id description 0 420 N Scottsdale Rd Tempe ... NaN NaN 1 1255 E University Dr Tempe ... NaN NaN 2 1979 E Rio Salado Pkwy Tempe ... NaN NaN @@ -118,7 +116,7 @@ The following exceptions may be raised when using HomeHarvest: - `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `NoResultsFound` - no properties found from your input -- `GeoCoordsNotFound` - if Zillow scraper is not able to find the geo-coordinates from the `location` +- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input ## Frequently Asked Questions diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index ca6d7eb..e2f7f2a 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -17,7 +17,6 @@ _scrapers = { "zillow": ZillowScraper, } - def validate_input(site_name: str, listing_type: str) -> None: if site_name.lower() not in _scrapers: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") @@ -27,7 +26,6 @@ def validate_input(site_name: str, listing_type: str) -> None: f"Provided listing type, '{listing_type}', does not exist." ) - def get_ordered_properties(result: Property) -> list[str]: return [ "property_url", @@ -67,7 +65,6 @@ def get_ordered_properties(result: Property) -> list[str]: "longitude", ] - def process_result(result: Property) -> pd.DataFrame: prop_data = result.__dict__ @@ -93,7 +90,6 @@ def process_result(result: Property) -> pd.DataFrame: return properties_df - def _scrape_single_site( location: str, site_name: str, listing_type: str ) -> pd.DataFrame: @@ -112,6 +108,7 @@ def _scrape_single_site( results = site.search() properties_dfs = [process_result(result) for result in results] + properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty] if not properties_dfs: return pd.DataFrame() @@ -154,6 +151,8 @@ def scrape_property( result = future.result() results.append(result) + results = [df for df in results if not df.empty and not df.isna().all().all()] + if not results: return pd.DataFrame()