mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-04 19:44:29 -08:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f37bfdeb8 | ||
|
|
48c2338276 | ||
|
|
f58a1f4a74 | ||
|
|
4cef926d7d | ||
|
|
e82eeaa59f | ||
|
|
644f16b25b | ||
|
|
e9ddc6df92 | ||
|
|
50fb1c391d |
60
README.md
60
README.md
@@ -1,5 +1,8 @@
|
||||
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
||||
|
||||
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
||||
|
||||
|
||||
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
|
||||
|
||||
[Run on Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||
@@ -10,7 +13,7 @@
|
||||
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
||||
- Aggregates the properties in a Pandas DataFrame
|
||||
|
||||
[Video Guide for HomeHarvest](https://www.youtube.com/watch?v=HCoHoiJdWQY)
|
||||
[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_
|
||||
|
||||

|
||||
|
||||
@@ -37,6 +40,7 @@ By default:
|
||||
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
|
||||
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
|
||||
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
|
||||
- Use `-k` or `--keep_duplicates` to keep duplicate properties based on address. If not provided, duplicates will be removed.
|
||||
### Python
|
||||
|
||||
```py
|
||||
@@ -71,8 +75,9 @@ Required
|
||||
├── location (str): address in various formats e.g. just zip, full address, city/state, etc.
|
||||
└── listing_type (enum): for_rent, for_sale, sold
|
||||
Optional
|
||||
├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin
|
||||
├── site_name (list[enum], default=all three sites): zillow, realtor.com, redfin
|
||||
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
|
||||
└── keep_duplicates (bool, default=False): whether to keep or remove duplicate properties based on address
|
||||
```
|
||||
|
||||
### Property Schema
|
||||
@@ -81,7 +86,7 @@ Property
|
||||
├── Basic Information:
|
||||
│ ├── property_url (str)
|
||||
│ ├── site_name (enum): zillow, redfin, realtor.com
|
||||
│ ├── listing_type (enum: ListingType)
|
||||
│ ├── listing_type (enum): for_sale, for_rent, sold
|
||||
│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building
|
||||
|
||||
├── Address Details:
|
||||
@@ -92,45 +97,38 @@ Property
|
||||
│ ├── unit (str)
|
||||
│ └── country (str)
|
||||
|
||||
├── Property Features:
|
||||
│ ├── price (int)
|
||||
├── House for Sale Features:
|
||||
│ ├── tax_assessed_value (int)
|
||||
│ ├── currency (str)
|
||||
│ ├── square_feet (int)
|
||||
│ ├── beds (int)
|
||||
│ ├── baths (float)
|
||||
│ ├── lot_area_value (float)
|
||||
│ ├── lot_area_unit (str)
|
||||
│ ├── stories (int)
|
||||
│ └── year_built (int)
|
||||
│ ├── year_built (int)
|
||||
│ └── price_per_sqft (int)
|
||||
|
||||
├── Building for Sale and Apartment Details:
|
||||
│ ├── bldg_name (str)
|
||||
│ ├── beds_min (int)
|
||||
│ ├── beds_max (int)
|
||||
│ ├── baths_min (float)
|
||||
│ ├── baths_max (float)
|
||||
│ ├── sqft_min (int)
|
||||
│ ├── sqft_max (int)
|
||||
│ ├── price_min (int)
|
||||
│ ├── price_max (int)
|
||||
│ ├── area_min (int)
|
||||
│ └── unit_count (int)
|
||||
|
||||
├── Miscellaneous Details:
|
||||
│ ├── price_per_sqft (int)
|
||||
│ ├── mls_id (str)
|
||||
│ ├── agent_name (str)
|
||||
│ ├── img_src (str)
|
||||
│ ├── description (str)
|
||||
│ ├── status_text (str)
|
||||
│ ├── latitude (float)
|
||||
│ ├── longitude (float)
|
||||
│ └── posted_time (str) [Only for Zillow]
|
||||
│ └── posted_time (str)
|
||||
|
||||
├── Building Details (for property_type: building):
|
||||
│ ├── bldg_name (str)
|
||||
│ ├── bldg_unit_count (int)
|
||||
│ ├── bldg_min_beds (int)
|
||||
│ ├── bldg_min_baths (float)
|
||||
│ └── bldg_min_area (int)
|
||||
|
||||
└── Apartment Details (for property type: apartment):
|
||||
├── apt_min_beds: int
|
||||
├── apt_max_beds: int
|
||||
├── apt_min_baths: float
|
||||
├── apt_max_baths: float
|
||||
├── apt_min_price: int
|
||||
├── apt_max_price: int
|
||||
├── apt_min_sqft: int
|
||||
├── apt_max_sqft: int
|
||||
└── Location Details:
|
||||
├── latitude (float)
|
||||
└── longitude (float)
|
||||
```
|
||||
## Supported Countries for Property Scraping
|
||||
|
||||
@@ -144,7 +142,7 @@ The following exceptions may be raised when using HomeHarvest:
|
||||
- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com`
|
||||
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
|
||||
- `NoResultsFound` - no properties found from your input
|
||||
- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input
|
||||
- `GeoCoordsNotFound` - if Zillow scraper is not able to derive geo-coordinates from the location you input
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
|
||||
@@ -119,6 +119,7 @@ def scrape_property(
|
||||
site_name: Union[str, list[str]] = None,
|
||||
listing_type: str = "for_sale",
|
||||
proxy: str = None,
|
||||
keep_duplicates: bool = False
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Scrape property from various sites from a given location and listing type.
|
||||
@@ -165,5 +166,6 @@ def scrape_property(
|
||||
if col not in final_df.columns:
|
||||
final_df[col] = None
|
||||
|
||||
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
|
||||
if not keep_duplicates:
|
||||
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
|
||||
return final_df
|
||||
|
||||
@@ -42,11 +42,18 @@ def main():
|
||||
help="Name of the output file (without extension)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-k",
|
||||
"--keep_duplicates",
|
||||
action="store_true",
|
||||
help="Keep duplicate properties based on address"
|
||||
)
|
||||
|
||||
parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
|
||||
result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
|
||||
|
||||
if not args.filename:
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
@@ -220,7 +220,14 @@ class RedfinScraper(Scraper):
|
||||
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
||||
response = self.session.get(url)
|
||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||
homes = [self._parse_home(home) for home in response_json["payload"]["homes"]] + [
|
||||
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
|
||||
]
|
||||
return homes
|
||||
|
||||
if "payload" in response_json:
|
||||
homes_list = response_json["payload"].get("homes", [])
|
||||
buildings_list = response_json["payload"].get("buildings", {}).values()
|
||||
|
||||
homes = [self._parse_home(home) for home in homes_list] + [
|
||||
self._parse_building(building) for building in buildings_list
|
||||
]
|
||||
return homes
|
||||
else:
|
||||
return []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "homeharvest"
|
||||
version = "0.2.7"
|
||||
version = "0.2.10"
|
||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||
|
||||
Reference in New Issue
Block a user