Compare commits

..

11 Commits

Author SHA1 Message Date
Cullen Watson
19f23c95c4 Merge pull request #43 from Bunsly/add_photos
Add photos
2023-11-24 21:40:34 -06:00
Cullen
4676ec9839 chore: remove test file 2023-11-24 13:42:52 -06:00
Cullen
6dd0b058d3 chore: version 2023-11-24 13:41:46 -06:00
Cullen
a74c1a9950 enh: add photos 2023-11-24 13:40:57 -06:00
Cullen Watson
fa507dbc72 docs: typo 2023-11-20 01:05:10 -06:00
Cullen Watson
5b6a9943cc Merge pull request #42 from Bunsly/street_dirction
fix: add street direction
2023-11-08 16:53:29 -06:00
Cullen Watson
9816defaf3 chore: version 2023-11-08 16:53:05 -06:00
Cullen Watson
f692b438b2 fix: add street direction 2023-11-08 16:52:06 -06:00
Zachary Hampton
30f48f54c8 Update README.md 2023-11-06 22:13:01 -07:00
Cullen Watson
7f86f69610 docs: readme 2023-11-03 18:53:46 -05:00
Cullen Watson
cc64dacdb0 docs: readme - date_from, date_to 2023-11-03 18:52:22 -05:00
5 changed files with 61 additions and 6 deletions

View File

@@ -6,7 +6,7 @@
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com). **Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to work with us.* *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy)** a Python package for job scraping* Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy)** a Python package for job scraping*
@@ -91,15 +91,16 @@ Optional
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale). ├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│ Example: 30 (fetches properties listed/sold in the last 30 days) │ Example: 30 (fetches properties listed/sold in the last 30 days)
├── date_range (string tuple): Start and end dates to filter properties listed or sold, both dates are required. ├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
} (use this to get properties in chunks as there's a 10k result limit) | (use this to get properties in chunks as there's a 10k result limit)
│ Format for both must be "YYYY-MM-DD". │ Format for both must be "YYYY-MM-DD".
│ Example: ("2023-05-01", "2023-05-15") (fetches properties listed/sold between these dates) │ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
└── proxy (string): In format 'http://user:pass@host:port' └── proxy (string): In format 'http://user:pass@host:port'
``` ```
### CLI ### CLI
@@ -177,7 +178,7 @@ Property
The following exceptions may be raised when using HomeHarvest: The following exceptions may be raised when using HomeHarvest:
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `NoResultsFound` - no properties found from your search - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD
## Frequently Asked Questions ## Frequently Asked Questions

View File

@@ -34,6 +34,8 @@ class Address:
@dataclass @dataclass
class Description: class Description:
primary_photo: str | None = None
alt_photos: list[str] | None = None
style: str | None = None style: str | None = None
beds: int | None = None beds: int | None = None
baths_full: int | None = None baths_full: int | None = None

View File

@@ -49,6 +49,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -83,6 +84,12 @@ class RealtorScraper(Scraper):
garage garage
permalink permalink
} }
primary_photo {
href
}
photos {
href
}
} }
}""" }"""
@@ -151,6 +158,8 @@ class RealtorScraper(Scraper):
else None, else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
primary_photo=property_info["primary_photo"].get("href", "").replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75"),
alt_photos=self.process_alt_photos(property_info.get("photos", [])),
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -215,6 +224,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -245,6 +255,12 @@ class RealtorScraper(Scraper):
units units
year_built year_built
} }
primary_photo {
href
}
photos {
href
}
} }
}""" }"""
@@ -315,6 +331,7 @@ class RealtorScraper(Scraper):
} }
location { location {
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -331,6 +348,12 @@ class RealtorScraper(Scraper):
name name
} }
} }
primary_photo {
href
}
photos {
href
}
} }
} }
}""" }"""
@@ -606,6 +629,7 @@ class RealtorScraper(Scraper):
return Address( return Address(
street=" ".join([ street=" ".join([
self.handle_none_safely(address.get('street_number')), self.handle_none_safely(address.get('street_number')),
self.handle_none_safely(address.get('street_direction')),
self.handle_none_safely(address.get('street_name')), self.handle_none_safely(address.get('street_name')),
self.handle_none_safely(address.get('street_suffix')), self.handle_none_safely(address.get('street_suffix')),
]).strip(), ]).strip(),
@@ -617,6 +641,7 @@ class RealtorScraper(Scraper):
@staticmethod @staticmethod
def _parse_description(result: dict) -> Description: def _parse_description(result: dict) -> Description:
description_data = result.get("description", {}) description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict): if description_data is None or not isinstance(description_data, dict):
@@ -626,7 +651,16 @@ class RealtorScraper(Scraper):
if style is not None: if style is not None:
style = style.upper() style = style.upper()
primary_photo = ""
if result and "primary_photo" in result:
primary_photo_info = result["primary_photo"]
if primary_photo_info and "href" in primary_photo_info:
primary_photo_href = primary_photo_info["href"]
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
return Description( return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=style, style=style,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
@@ -639,6 +673,7 @@ class RealtorScraper(Scraper):
stories=description_data.get("stories"), stories=description_data.get("stories"),
) )
@staticmethod @staticmethod
def calculate_days_on_mls(result: dict) -> Optional[int]: def calculate_days_on_mls(result: dict) -> Optional[int]:
list_date_str = result.get("list_date") list_date_str = result.get("list_date")
@@ -657,3 +692,16 @@ class RealtorScraper(Scraper):
days = (today - list_date).days days = (today - list_date).days
if days >= 0: if days >= 0:
return days return days
@staticmethod
def process_alt_photos(photos_info):
try:
alt_photos = []
if photos_info:
for photo_info in photos_info:
href = photo_info.get("href", "")
alt_photo_href = href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
alt_photos.append(alt_photo_href)
return alt_photos
except Exception:
pass

View File

@@ -5,6 +5,8 @@ from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
"property_url", "property_url",
"primary_photo",
"alt_photos",
"mls", "mls",
"mls_id", "mls_id",
"status", "status",
@@ -49,6 +51,8 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style prop_data["style"] = description.style
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.8" version = "0.3.10"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"