Merge pull request #6 from ZacharyHampton/tidy_up_readme

Minor fixes
Zachary Hampton 2023-09-18 19:04:08 -07:00 committed by GitHub
commit 7297f0eb33
5 changed files with 57 additions and 15 deletions

View File

@@ -31,11 +31,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# scrapes all 3 sites by default\n",
     "scrape_property(\n",
-    "    location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
+    "    location=\"dallas\",\n",
+    "    listing_type=\"for_sale\"\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aaf86093",
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# search a specific address\n",
+    "scrape_property(\n",
+    "    location=\"2530 Al Lipscomb Way\",\n",
+    "    site_name=\"zillow\",\n",
+    "    listing_type=\"for_sale\"\n",
+    "),"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -43,8 +65,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# check rentals\n",
     "scrape_property(\n",
-    "    location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
+    "    location=\"chicago\",\n",
+    "    site_name=[\"redfin\", \"realtor.com\"],\n",
+    "    listing_type=\"for_rent\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af280cd3",
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# check sold properties\n",
+    "scrape_property(\n",
+    "    location=\"chicago, illinois\",\n",
+    "    site_name=[\"redfin\"],\n",
+    "    listing_type=\"sold\"\n",
     ")"
    ]
  }
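For readers who do not want to pick the examples out of the notebook JSON, the four cells added above reduce to the following plain Python calls. The `from homeharvest import scrape_property` import path is assumed from the package's README and is not part of this diff.

```python
# Plain-script rendering of the example cells added to the notebook above.
from homeharvest import scrape_property  # import path assumed, not shown in this diff

# scrapes all 3 sites by default
all_sites = scrape_property(
    location="dallas",
    listing_type="for_sale",
)

# search a specific address
single_address = scrape_property(
    location="2530 Al Lipscomb Way",
    site_name="zillow",
    listing_type="for_sale",
)

# check rentals
rentals = scrape_property(
    location="chicago",
    site_name=["redfin", "realtor.com"],
    listing_type="for_rent",
)

# check sold properties
sold = scrape_property(
    location="chicago, illinois",
    site_name=["redfin"],
    listing_type="sold",
)
```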

View File

@@ -1,14 +1,14 @@
-# HomeHarvest
+<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
 
-**HomeHarvest** is a simple but comprehensive real estate scraping library.
+**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
 
 [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
 
 *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
 
 ## Features
 
 - Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
 - Aggregates the properties in a Pandas DataFrame
@@ -32,8 +32,6 @@ properties: pd.DataFrame = scrape_property(
 #: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel().
 print(properties)
 ```
 
 ## Output
 ```py
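The export note in the README hunk above can be shown end to end. This is only a sketch: the file names and the `index=False` argument are illustrative choices, not taken from the repository.

```python
import pandas as pd
from homeharvest import scrape_property  # import path assumed, not shown in this diff

properties: pd.DataFrame = scrape_property(
    location="dallas",
    listing_type="for_sale",
)

# Export as the README note suggests; the file names are arbitrary examples.
properties.to_csv("properties.csv", index=False)
properties.to_excel("properties.xlsx", index=False)  # Excel export needs openpyxl installed
```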

View File

@@ -17,7 +17,6 @@ _scrapers = {
     "zillow": ZillowScraper,
 }
 
-
 def validate_input(site_name: str, listing_type: str) -> None:
     if site_name.lower() not in _scrapers:
         raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
@@ -27,7 +26,6 @@ def validate_input(site_name: str, listing_type: str) -> None:
             f"Provided listing type, '{listing_type}', does not exist."
         )
 
-
 def get_ordered_properties(result: Property) -> list[str]:
     return [
         "property_url",
@@ -67,7 +65,6 @@ def get_ordered_properties(result: Property) -> list[str]:
         "longitude",
     ]
 
-
 def process_result(result: Property) -> pd.DataFrame:
     prop_data = result.__dict__
@@ -93,7 +90,6 @@ def process_result(result: Property) -> pd.DataFrame:
     return properties_df
 
-
 def _scrape_single_site(
     location: str, site_name: str, listing_type: str
 ) -> pd.DataFrame:
@@ -112,6 +108,7 @@ def _scrape_single_site(
     results = site.search()
 
     properties_dfs = [process_result(result) for result in results]
+    properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty]
 
     if not properties_dfs:
         return pd.DataFrame()
@@ -154,6 +151,8 @@
             result = future.result()
             results.append(result)
 
+    results = [df for df in results if not df.empty and not df.isna().all().all()]
+
     if not results:
         return pd.DataFrame()
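Both additions in this file guard the final aggregation against empty or all-NaN per-site results. Below is a minimal pandas sketch of the same idea, using made-up DataFrames and a `pd.concat` step that stands in for the library's own combining code.

```python
import pandas as pd

# Stand-in per-site results; in the library these come from process_result().
dfs = [
    pd.DataFrame({"price": [350_000], "beds": [None]}),  # one useful column, one all-NaN column
    pd.DataFrame({"price": [None], "beds": [None]}),     # a row with no real values
    pd.DataFrame(),                                      # a site that returned nothing
]

# Mirrors the _scrape_single_site change: drop all-NaN columns, skip empty frames.
dfs = [df.dropna(axis=1, how="all") for df in dfs if not df.empty]

# Mirrors the scrape_property change: keep only frames with at least one real value.
dfs = [df for df in dfs if not df.empty and not df.isna().all().all()]

# Stand-in for the library's own combining step.
combined = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print(combined)
```

The per-frame `dropna(axis=1, how="all")` means a column only survives into the combined result if at least one site actually populated it.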

View File

@@ -249,8 +249,8 @@ class RealtorScraper(Scraper):
                     unit=parse_unit(result["location"]["address"]["unit"]),
                     country="USA",
                 ),
-                latitude=result["location"]["address"]["coordinate"]["lat"],
-                longitude=result["location"]["address"]["coordinate"]["lon"],
+                latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None,
+                longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None,
                 site_name=self.site_name,
                 property_url="https://www.realtor.com/realestateandhomes-detail/"
                 + result["property_id"],
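The guarded expressions added here repeat the full key path several times per field. A hypothetical alternative (not part of HomeHarvest) is a small helper that walks the nested keys and returns None at the first missing level; it behaves roughly the same as the chained checks above.

```python
from typing import Any


def deep_get(obj: Any, *keys: str) -> Any:
    """Hypothetical helper (not part of HomeHarvest): walk nested dict keys and
    return None as soon as a level is missing or is not a dict."""
    for key in keys:
        if not isinstance(obj, dict) or key not in obj:
            return None
        obj = obj[key]
    return obj


# Roughly equivalent to the guarded keyword arguments added in this hunk.
result = {"location": {"address": {"coordinate": {"lat": 32.78, "lon": -96.8}}}}  # made-up payload
latitude = deep_get(result, "location", "address", "coordinate", "lat")
longitude = deep_get(result, "location", "address", "coordinate", "lon")
print(latitude, longitude)  # 32.78 -96.8

missing = deep_get({}, "location", "address", "coordinate", "lat")
print(missing)  # None
```

Whether such a helper belongs in the scraper is a style call; the inline guards in the commit keep the change local to the two keyword arguments.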

View File

@@ -94,8 +94,8 @@ class RedfinScraper(Scraper):
             price_per_sqft=get_value("pricePerSqFt"),
             price=get_value("price"),
             mls_id=get_value("mlsId"),
-            latitude=home["latLong"]["latitude"] if "latLong" in home else None,
-            longitude=home["latLong"]["longitude"] if "latLong" in home else None,
+            latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
+            longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None
         )
 
     def _parse_building(self, building: dict) -> Property:
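The same defensive lookup can also be written with chained `dict.get()` calls, which behave like the membership checks above whenever `latLong` is either absent or a plain dict. The sample payloads below are invented for illustration, not taken from Redfin's API.

```python
# Invented sample payloads; the real Redfin response shape is only implied by the diff.
homes = [
    {"latLong": {"latitude": 41.88, "longitude": -87.63}},  # both coordinates present
    {"latLong": {}},                                        # latLong present but empty
    {},                                                     # no latLong at all
]

for home in homes:
    latitude = home.get("latLong", {}).get("latitude")
    longitude = home.get("latLong", {}).get("longitude")
    print(latitude, longitude)
```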