From 82092faa289247a461a975c448e7ec2147d9e761 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 19:35:38 -0500 Subject: [PATCH 1/5] docs: readme --- README.md | 155 ++++++++++++++++--- homeharvest/core/scrapers/zillow/__init__.py | 6 +- homeharvest/exceptions.py | 6 +- pyproject.toml | 2 +- 4 files changed, 142 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 17982cd..b4b3fc7 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,63 @@ # HomeHarvest -**HomeHarvest** aims to be the top Python real estate scraping library. - -_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._ +**HomeHarvest** is a simple but comprehensive real estate scraping library. [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) + +*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* +## Features + + +- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously +- Aggregates the properties in a Pandas DataFrame +- Proxy support (HTTP/S, SOCKS) + + ## Installation ```bash pip install --upgrade homeharvest ``` - -## Example Usage + _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ + +## Usage ```py ->>> from homeharvest import scrape_property -... properties = scrape_property( -... location="85281", site_name="zillow", listing_type="for_rent" -... 
) +from homeharvest import scrape_property +import pandas as pd +properties: pd.DataFrame = scrape_property( + site_name=["zillow", "realtor.com", "redfin"], + location="85281", + listing_type="for_rent" # for_sale / sold + + # use if you want to use a proxy (3 types) + # proxy="socks5://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", + # proxy="http://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", + # proxy="https://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", +) + +#1 output to .csv (simplest, then use Excel) +properties.to_csv('props.csv', index=False) + +#2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook) + +# formatting for pandas +#pd.set_option('display.max_columns', None) +#pd.set_option('display.max_rows', None) +#pd.set_option('display.width', None) +#pd.set_option('display.max_colwidth', 50) # set to 0 to see full property_url / descr +#display(properties) + +#3 output to console +#print(properties) + + +``` +## Output +```py >>> properties.head() - address_one city ... mls_id description + street city ... mls_id description 0 420 N Scottsdale Rd Tempe ... NaN NaN 1 1255 E University Dr Tempe ... NaN NaN 2 1979 E Rio Salado Pkwy Tempe ... NaN NaN @@ -29,14 +66,96 @@ pip install --upgrade homeharvest [5 rows x 23 columns] ``` -### Site Name Options +### Parameters for `scrape_property()` +```plaintext +Required +├── location (str): address in various formats e.g. just zip, full address, city/state, etc. 
+└── listing_type (enum): for_rent, for_sale, sold +Optional +├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin +├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] +``` -- `zillow` -- `redfin` -- `realtor.com` +### Property Schema +```plaintext +Property +├── Basic Information: +│ ├── property_url (str) +│ ├── site_name (enum): zillow, redfin, realtor.com +│ ├── listing_type (enum: ListingType) +│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building -### Listing Types +├── Address Details: +│ ├── street_address (str) +│ ├── city (str) +│ ├── state (str) +│ ├── zip_code (str) +│ ├── unit (str) +│ └── country (str) + +├── Property Features: +│ ├── price (int) +│ ├── tax_assessed_value (int) +│ ├── currency (str) +│ ├── square_feet (int) +│ ├── beds (int) +│ ├── baths (float) +│ ├── lot_area_value (float) +│ ├── lot_area_unit (str) +│ ├── stories (int) +│ └── year_built (int) + +├── Miscellaneous Details: +│ ├── price_per_sqft (int) +│ ├── mls_id (str) +│ ├── agent_name (str) +│ ├── img_src (str) +│ ├── description (str) +│ ├── status_text (str) +│ ├── latitude (float) +│ ├── longitude (float) +│ └── posted_time (str) + +├── Building Details (for property_type: building): +│ ├── bldg_name (str) +│ ├── bldg_unit_count (int) +│ ├── bldg_min_beds (int) +│ ├── bldg_min_baths (float) +│ └── bldg_min_area (int) + +└── Apartment Details (for property_type: apartment): + └── apt_min_price (int) +``` +## Supported Countries for Property Scraping + +* **Zillow**: contains listings in the **US** & **Canada** +* **Realtor.com**: mainly from the **US** but also has international listings +* **Redfin**: listings mainly in the **US**, **Canada**, & has expanded to some areas in **Mexico** + +### Exceptions +The following exceptions may be raised when using HomeHarvest: + +- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com` +- `InvalidListingType` - valid options: 
`for_sale`, `for_rent`, `sold` +- `NoResultsFound` - no properties found from your input +- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the locaion you input + + + +## Frequently Asked Questions + +--- + +**Q: Encountering issues with your queries?** +**A:** Try a single site and/or broadening the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). + +--- + +**Q: Received a Forbidden 403 response code?** +**A:** This indicates that you have been blocked by the real estate site for sending too many requests. Currently, **Zillow** is particularly aggressive with blocking. We recommend: + +- Waiting a few seconds between requests. +- Trying a VPN or proxy to change your IP address. + +--- -- `for_rent` -- `for_sale` -- `sold` diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 9eaa546..3cfb5cb 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -2,7 +2,7 @@ import re import json from .. 
import Scraper from ....utils import parse_address_two, parse_unit -from ....exceptions import NoResultsFound, PropertyNotFound +from ....exceptions import GeoCoordsNotFound from ..models import Property, Address, ListingType, PropertyType, SiteName @@ -45,7 +45,7 @@ class ZillowScraper(Scraper): return self._fetch_properties_backend(coords) else: - raise BoxBoundsNotFound("Box bounds could not be located.") + raise GeoCoordsNotFound("Box bounds could not be located.") elif "gdpClientCache" in data["props"]["pageProps"]: gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"]) @@ -55,7 +55,7 @@ class ZillowScraper(Scraper): property = self._get_single_property_page(property_data) return [property] - raise PropertyNotFound("Specific property data not found in the response.") + raise NoResultsFound("Specific property data not found in the response.") def _fetch_properties_backend(self, coords): url = "https://www.zillow.com/async-create-search-page-state" diff --git a/homeharvest/exceptions.py b/homeharvest/exceptions.py index 299e02b..cd18640 100644 --- a/homeharvest/exceptions.py +++ b/homeharvest/exceptions.py @@ -10,9 +10,5 @@ class NoResultsFound(Exception): """Raised when no results are found for the given location""" -class PropertyNotFound(Exception): - """Raised when no property is found for the given address""" - - -class BoxBoundsNotFound(Exception): +class GeoCoordsNotFound(Exception): """Raised when no property is found for the given address""" diff --git a/pyproject.toml b/pyproject.toml index 403ba1a..33eae81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.1.4" +version = "0.2.0" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" From fbbd56d930ba862b76abe4b7a0ded1b7835368fc Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 19:39:22 -0500 Subject: 
[PATCH 2/5] docs: remove proxy usage --- README.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/README.md b/README.md index b4b3fc7..5fd500a 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,6 @@ - Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously - Aggregates the properties in a Pandas DataFrame -- Proxy support (HTTP/S, SOCKS) - ## Installation @@ -30,11 +28,6 @@ properties: pd.DataFrame = scrape_property( site_name=["zillow", "realtor.com", "redfin"], location="85281", listing_type="for_rent" # for_sale / sold - - # use if you want to use a proxy (3 types) - # proxy="socks5://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", - # proxy="http://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", - # proxy="https://homeharvest:5a4vpWtj8EeJ2hoYzk@us.smartproxy.com:20001", ) #1 output to .csv (simplest, then use Excel) @@ -73,7 +66,6 @@ Required └── listing_type (enum): for_rent, for_sale, sold Optional ├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin -├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] ``` ### Property Schema @@ -140,8 +132,6 @@ The following exceptions may be raised when using HomeHarvest: - `NoResultsFound` - no properties found from your input - `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the locaion you input - - ## Frequently Asked Questions --- @@ -155,7 +145,7 @@ The following exceptions may be raised when using HomeHarvest: **A:** This indicates that you have been blocked by the real estate site for sending too many requests. Currently, **Zillow** is particularly aggressive with blocking. We recommend: - Waiting a few seconds between requests. -- Trying a VPN or proxy to change your IP address. +- Trying a VPN to change your IP address. 
--- From 0621b01d9ad83c3a75c638765769747108edfa70 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 18 Sep 2023 19:40:49 -0500 Subject: [PATCH 3/5] docs: readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5fd500a..1790f38 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Property │ ├── status_text (str) │ ├── latitude (float) │ ├── longitude (float) -│ └── posted_time (str) +│ └── posted_time (str) [Only for Zillow] ├── Building Details (for property_type: building): │ ├── bldg_name (str) From ae3961514b63fb6504e75b7e08f140ddd153d54d Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 17:45:14 -0700 Subject: [PATCH 4/5] Update README.md --- README.md | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 5fd500a..42764ff 100644 --- a/README.md +++ b/README.md @@ -30,20 +30,8 @@ properties: pd.DataFrame = scrape_property( listing_type="for_rent" # for_sale / sold ) -#1 output to .csv (simplest, then use Excel) -properties.to_csv('props.csv', index=False) - -#2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook) - -# formatting for pandas -#pd.set_option('display.max_columns', None) -#pd.set_option('display.max_rows', None) -#pd.set_option('display.width', None) -#pd.set_option('display.max_colwidth', 50) # set to 0 to see full property_url / descr -#display(properties) - -#3 output to console -#print(properties) +#: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel(). 
+print(properties) ``` @@ -106,7 +94,7 @@ Property │ ├── status_text (str) │ ├── latitude (float) │ ├── longitude (float) -│ └── posted_time (str) +│ └── posted_time (str) [Only for Zillow] ├── Building Details (for property_type: building): │ ├── bldg_name (str) @@ -130,14 +118,14 @@ The following exceptions may be raised when using HomeHarvest: - `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `NoResultsFound` - no properties found from your input -- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the locaion you input +- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input ## Frequently Asked Questions --- **Q: Encountering issues with your queries?** -**A:** Try a single site and/or broadening the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). +**A:** Try a single site and/or broaden the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). --- From a1684f87db7ea55999f8546289bf08d3446c42b8 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 17:46:58 -0700 Subject: [PATCH 5/5] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 33eae81..fa1d1b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "homeharvest" version = "0.2.0" -description = "Real estate scraping library" +description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" readme = "README.md"