commit
7297f0eb33
|
@ -31,11 +31,33 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# scrapes all 3 sites by default\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
|
" location=\"dallas\",\n",
|
||||||
|
" listing_type=\"for_sale\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "aaf86093",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# search a specific address\n",
|
||||||
|
"scrape_property(\n",
|
||||||
|
" location=\"2530 Al Lipscomb Way\",\n",
|
||||||
|
" site_name=\"zillow\",\n",
|
||||||
|
" listing_type=\"for_sale\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
@ -43,8 +65,31 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# check rentals\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
|
" location=\"chicago\",\n",
|
||||||
|
" site_name=[\"redfin\", \"realtor.com\"],\n",
|
||||||
|
" listing_type=\"for_rent\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "af280cd3",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# check sold properties\n",
|
||||||
|
"scrape_property(\n",
|
||||||
|
" location=\"chicago, illinois\",\n",
|
||||||
|
" site_name=[\"redfin\"],\n",
|
||||||
|
" listing_type=\"sold\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
# HomeHarvest
|
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
||||||
|
|
||||||
**HomeHarvest** is a simple but comprehensive real estate scraping library.
|
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
|
||||||
|
|
||||||
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||||
|
|
||||||
|
|
||||||
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
||||||
- Aggregates the properties in a Pandas DataFrame
|
- Aggregates the properties in a Pandas DataFrame
|
||||||
|
|
||||||
|
@ -32,8 +32,6 @@ properties: pd.DataFrame = scrape_property(
|
||||||
|
|
||||||
#: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel().
|
#: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel().
|
||||||
print(properties)
|
print(properties)
|
||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
## Output
|
## Output
|
||||||
```py
|
```py
|
||||||
|
|
|
@ -17,7 +17,6 @@ _scrapers = {
|
||||||
"zillow": ZillowScraper,
|
"zillow": ZillowScraper,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def validate_input(site_name: str, listing_type: str) -> None:
|
def validate_input(site_name: str, listing_type: str) -> None:
|
||||||
if site_name.lower() not in _scrapers:
|
if site_name.lower() not in _scrapers:
|
||||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||||
|
@ -27,7 +26,6 @@ def validate_input(site_name: str, listing_type: str) -> None:
|
||||||
f"Provided listing type, '{listing_type}', does not exist."
|
f"Provided listing type, '{listing_type}', does not exist."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_ordered_properties(result: Property) -> list[str]:
|
def get_ordered_properties(result: Property) -> list[str]:
|
||||||
return [
|
return [
|
||||||
"property_url",
|
"property_url",
|
||||||
|
@ -67,7 +65,6 @@ def get_ordered_properties(result: Property) -> list[str]:
|
||||||
"longitude",
|
"longitude",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def process_result(result: Property) -> pd.DataFrame:
|
def process_result(result: Property) -> pd.DataFrame:
|
||||||
prop_data = result.__dict__
|
prop_data = result.__dict__
|
||||||
|
|
||||||
|
@ -93,7 +90,6 @@ def process_result(result: Property) -> pd.DataFrame:
|
||||||
|
|
||||||
return properties_df
|
return properties_df
|
||||||
|
|
||||||
|
|
||||||
def _scrape_single_site(
|
def _scrape_single_site(
|
||||||
location: str, site_name: str, listing_type: str
|
location: str, site_name: str, listing_type: str
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
|
@ -112,6 +108,7 @@ def _scrape_single_site(
|
||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
properties_dfs = [process_result(result) for result in results]
|
||||||
|
properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty]
|
||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
@ -154,6 +151,8 @@ def scrape_property(
|
||||||
result = future.result()
|
result = future.result()
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
|
results = [df for df in results if not df.empty and not df.isna().all().all()]
|
||||||
|
|
||||||
if not results:
|
if not results:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
|
|
@ -249,8 +249,8 @@ class RealtorScraper(Scraper):
|
||||||
unit=parse_unit(result["location"]["address"]["unit"]),
|
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
country="USA",
|
||||||
),
|
),
|
||||||
latitude=result["location"]["address"]["coordinate"]["lat"],
|
latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None,
|
||||||
longitude=result["location"]["address"]["coordinate"]["lon"],
|
longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None,
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
+ result["property_id"],
|
+ result["property_id"],
|
||||||
|
|
|
@ -94,8 +94,8 @@ class RedfinScraper(Scraper):
|
||||||
price_per_sqft=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt"),
|
||||||
price=get_value("price"),
|
price=get_value("price"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"] if "latLong" in home else None,
|
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
||||||
longitude=home["latLong"]["longitude"] if "latLong" in home else None,
|
longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_building(self, building: dict) -> Property:
|
def _parse_building(self, building: dict) -> Property:
|
||||||
|
|
Loading…
Reference in New Issue