mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-06 20:44:28 -08:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f7e74cf535 | ||
|
|
e17b976923 | ||
|
|
ad13b55ea6 | ||
|
|
19f23c95c4 | ||
|
|
4676ec9839 | ||
|
|
6dd0b058d3 | ||
|
|
a74c1a9950 | ||
|
|
fa507dbc72 |
@@ -2,14 +2,10 @@
|
|||||||
|
|
||||||
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings.
|
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings.
|
||||||
|
|
||||||
[Try with Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
|
||||||
|
|
||||||
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
||||||
|
|
||||||
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
|
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
|
||||||
|
|
||||||
Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy)** – a Python package for job scraping*
|
|
||||||
|
|
||||||
## HomeHarvest Features
|
## HomeHarvest Features
|
||||||
|
|
||||||
- **Source**: Fetches properties directly from **Realtor.com**.
|
- **Source**: Fetches properties directly from **Realtor.com**.
|
||||||
@@ -92,7 +88,7 @@ Optional
|
|||||||
│ Example: 30 (fetches properties listed/sold in the last 30 days)
|
│ Example: 30 (fetches properties listed/sold in the last 30 days)
|
||||||
│
|
│
|
||||||
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
|
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
|
||||||
} (use this to get properties in chunks as there's a 10k result limit)
|
| (use this to get properties in chunks as there's a 10k result limit)
|
||||||
│ Format for both must be "YYYY-MM-DD".
|
│ Format for both must be "YYYY-MM-DD".
|
||||||
│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
|
│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
|
||||||
│
|
│
|
||||||
@@ -193,7 +189,7 @@ The following exceptions may be raised when using HomeHarvest:
|
|||||||
**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend:
|
**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend:
|
||||||
|
|
||||||
- Waiting a few seconds between requests.
|
- Waiting a few seconds between requests.
|
||||||
- Trying a VPN or useing a proxy as a parameter to scrape_property() to change your IP address.
|
- Trying a VPN or using a proxy as a parameter to scrape_property() to change your IP address.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,8 @@ class Address:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Description:
|
class Description:
|
||||||
|
primary_photo: str | None = None
|
||||||
|
alt_photos: list[str] | None = None
|
||||||
style: str | None = None
|
style: str | None = None
|
||||||
beds: int | None = None
|
beds: int | None = None
|
||||||
baths_full: int | None = None
|
baths_full: int | None = None
|
||||||
|
|||||||
@@ -84,6 +84,12 @@ class RealtorScraper(Scraper):
|
|||||||
garage
|
garage
|
||||||
permalink
|
permalink
|
||||||
}
|
}
|
||||||
|
primary_photo {
|
||||||
|
href
|
||||||
|
}
|
||||||
|
photos {
|
||||||
|
href
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
@@ -152,6 +158,8 @@ class RealtorScraper(Scraper):
|
|||||||
else None,
|
else None,
|
||||||
address=self._parse_address(property_info, search_type="handle_listing"),
|
address=self._parse_address(property_info, search_type="handle_listing"),
|
||||||
description=Description(
|
description=Description(
|
||||||
|
primary_photo=property_info["primary_photo"].get("href", "").replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75"),
|
||||||
|
alt_photos=self.process_alt_photos(property_info.get("photos", [])),
|
||||||
style=property_info["basic"].get("type", "").upper(),
|
style=property_info["basic"].get("type", "").upper(),
|
||||||
beds=property_info["basic"].get("beds"),
|
beds=property_info["basic"].get("beds"),
|
||||||
baths_full=property_info["basic"].get("baths_full"),
|
baths_full=property_info["basic"].get("baths_full"),
|
||||||
@@ -247,6 +255,12 @@ class RealtorScraper(Scraper):
|
|||||||
units
|
units
|
||||||
year_built
|
year_built
|
||||||
}
|
}
|
||||||
|
primary_photo {
|
||||||
|
href
|
||||||
|
}
|
||||||
|
photos {
|
||||||
|
href
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
@@ -334,6 +348,12 @@ class RealtorScraper(Scraper):
|
|||||||
name
|
name
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
primary_photo {
|
||||||
|
href
|
||||||
|
}
|
||||||
|
photos {
|
||||||
|
href
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}"""
|
}"""
|
||||||
@@ -553,6 +573,11 @@ class RealtorScraper(Scraper):
|
|||||||
"radius": "{}mi".format(self.radius),
|
"radius": "{}mi".format(self.radius),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
elif location_type == "postal_code":
|
||||||
|
search_variables |= {
|
||||||
|
"postal_code": location_info.get("postal_code"),
|
||||||
|
}
|
||||||
|
|
||||||
else: #: general search, location
|
else: #: general search, location
|
||||||
search_variables |= {
|
search_variables |= {
|
||||||
"city": location_info.get("city"),
|
"city": location_info.get("city"),
|
||||||
@@ -621,6 +646,7 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_description(result: dict) -> Description:
|
def _parse_description(result: dict) -> Description:
|
||||||
|
|
||||||
description_data = result.get("description", {})
|
description_data = result.get("description", {})
|
||||||
|
|
||||||
if description_data is None or not isinstance(description_data, dict):
|
if description_data is None or not isinstance(description_data, dict):
|
||||||
@@ -630,7 +656,16 @@ class RealtorScraper(Scraper):
|
|||||||
if style is not None:
|
if style is not None:
|
||||||
style = style.upper()
|
style = style.upper()
|
||||||
|
|
||||||
|
primary_photo = ""
|
||||||
|
if result and "primary_photo" in result:
|
||||||
|
primary_photo_info = result["primary_photo"]
|
||||||
|
if primary_photo_info and "href" in primary_photo_info:
|
||||||
|
primary_photo_href = primary_photo_info["href"]
|
||||||
|
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
|
||||||
|
|
||||||
return Description(
|
return Description(
|
||||||
|
primary_photo=primary_photo,
|
||||||
|
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
|
||||||
style=style,
|
style=style,
|
||||||
beds=description_data.get("beds"),
|
beds=description_data.get("beds"),
|
||||||
baths_full=description_data.get("baths_full"),
|
baths_full=description_data.get("baths_full"),
|
||||||
@@ -643,6 +678,7 @@ class RealtorScraper(Scraper):
|
|||||||
stories=description_data.get("stories"),
|
stories=description_data.get("stories"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def calculate_days_on_mls(result: dict) -> Optional[int]:
|
def calculate_days_on_mls(result: dict) -> Optional[int]:
|
||||||
list_date_str = result.get("list_date")
|
list_date_str = result.get("list_date")
|
||||||
@@ -661,3 +697,16 @@ class RealtorScraper(Scraper):
|
|||||||
days = (today - list_date).days
|
days = (today - list_date).days
|
||||||
if days >= 0:
|
if days >= 0:
|
||||||
return days
|
return days
|
||||||
|
|
||||||
|
@staticmethod
def process_alt_photos(photos_info) -> list[str]:
    """Build the list of alternate photo URLs from raw photo entries.

    Each entry is expected to be a dict carrying an ``href`` string. Every
    occurrence of the ``s.jpg`` fragment in that URL is replaced with the
    ``od-w480_h360_x2.webp?w=1080&q=75`` rendition suffix (presumably a
    larger image variant -- confirm against the image host).

    Malformed entries (non-dict items, or an ``href`` that is not a string)
    are skipped individually instead of aborting the whole conversion, so
    one bad record no longer discards every valid photo.

    :param photos_info: iterable of photo dicts; may be None or empty.
    :return: list of rewritten URLs. Always a list (possibly empty), never
        None, so callers can safely pass it to ``", ".join(...)``.
    """
    thumbnail_fragment = "s.jpg"
    rendition_fragment = "od-w480_h360_x2.webp?w=1080&q=75"

    if not photos_info:
        return []

    try:
        entries = iter(photos_info)
    except TypeError:
        # Not iterable at all (e.g. an unexpected scalar): nothing to convert.
        return []

    alt_photos = []
    for photo_info in entries:
        if not isinstance(photo_info, dict):
            continue
        # A missing key keeps the historical behaviour of yielding "".
        href = photo_info.get("href", "")
        if not isinstance(href, str):
            # Explicit null href in the payload; there is no URL to rewrite.
            continue
        alt_photos.append(href.replace(thumbnail_fragment, rendition_fragment))
    return alt_photos
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ ordered_properties = [
|
|||||||
"stories",
|
"stories",
|
||||||
"hoa_fee",
|
"hoa_fee",
|
||||||
"parking_garage",
|
"parking_garage",
|
||||||
|
"primary_photo",
|
||||||
|
"alt_photos",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -49,6 +51,8 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
|
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
|
||||||
|
|
||||||
description = result.description
|
description = result.description
|
||||||
|
prop_data["primary_photo"] = description.primary_photo
|
||||||
|
prop_data["alt_photos"] = ", ".join(description.alt_photos)
|
||||||
prop_data["style"] = description.style
|
prop_data["style"] = description.style
|
||||||
prop_data["beds"] = description.beds
|
prop_data["beds"] = description.beds
|
||||||
prop_data["full_baths"] = description.baths_full
|
prop_data["full_baths"] = description.baths_full
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.3.9"
|
version = "0.3.11"
|
||||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/Bunsly/HomeHarvest"
|
||||||
|
|||||||
Reference in New Issue
Block a user