Compare commits

...

23 Commits

Author SHA1 Message Date
Zachary Hampton
d775540afd - location bug fix 2024-03-06 16:31:06 -07:00
Cullen Watson
5ea9a6f6b6 docs: readme 2024-03-03 11:49:27 -06:00
robertomr100
ab6a0e3b6e Add foreclosure parameter (#55) 2024-03-03 11:45:28 -06:00
Zachary Hampton
03198428de Merge pull request #48 from Bunsly/for_rent_url
fix: rent url
2024-01-09 13:12:30 -07:00
Cullen Watson
70fa071318 fix: rent url 2024-01-08 12:46:31 -06:00
Cullen Watson
f7e74cf535 Merge pull request #44 from Bunsly/fix_postal_search
fix postal search to search just by zip
2023-12-02 00:40:13 -06:00
Cullen Watson
e17b976923 fix postal search to search just by zip 2023-12-02 00:39:28 -06:00
Zachary Hampton
ad13b55ea6 Update README.md 2023-11-30 11:48:48 -07:00
Cullen Watson
19f23c95c4 Merge pull request #43 from Bunsly/add_photos
Add photos
2023-11-24 21:40:34 -06:00
Cullen
4676ec9839 chore: remove test file 2023-11-24 13:42:52 -06:00
Cullen
6dd0b058d3 chore: version 2023-11-24 13:41:46 -06:00
Cullen
a74c1a9950 enh: add photos 2023-11-24 13:40:57 -06:00
Cullen Watson
fa507dbc72 docs: typo 2023-11-20 01:05:10 -06:00
Cullen Watson
5b6a9943cc Merge pull request #42 from Bunsly/street_dirction
fix: add street direction
2023-11-08 16:53:29 -06:00
Cullen Watson
9816defaf3 chore: version 2023-11-08 16:53:05 -06:00
Cullen Watson
f692b438b2 fix: add street direction 2023-11-08 16:52:06 -06:00
Zachary Hampton
30f48f54c8 Update README.md 2023-11-06 22:13:01 -07:00
Cullen Watson
7f86f69610 docs: readme 2023-11-03 18:53:46 -05:00
Cullen Watson
cc64dacdb0 docs: readme - date_from, date_to 2023-11-03 18:52:22 -05:00
Cullen Watson
d3268d8e5a Merge pull request #40 from Bunsly/date_range
Add date_to and date_from params
2023-11-03 18:42:13 -05:00
Cullen Watson
4edad901c5 [enh] date_to and date_from 2023-11-03 18:40:34 -05:00
Zachary Hampton
c597a78191 - None address bug fix 2023-10-18 16:32:43 -07:00
Zachary Hampton
11a7d854f0 - remove pending listings from for_sale 2023-10-18 14:41:41 -07:00
9 changed files with 236 additions and 122 deletions

View File

@@ -1,24 +1,16 @@
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400"> <img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings. **HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com). **Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to work with us.* *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy)** a Python package for job scraping*
## HomeHarvest Features ## HomeHarvest Features
- **Source**: Fetches properties directly from **Realtor.com**. - **Source**: Fetches properties directly from **Realtor.com**.
- **Data Format**: Structures data to resemble MLS listings. - **Data Format**: Structures data to resemble MLS listings.
- **Export Flexibility**: Options to save as either CSV or Excel. - **Export Flexibility**: Options to save as either CSV or Excel.
- **Usage Modes**:
- **Python**: For those who'd like to integrate scraping into their Python scripts.
- **CLI**: For users who prefer command-line operations.
[Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_ [Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_
@@ -27,7 +19,7 @@ Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy
## Installation ## Installation
```bash ```bash
pip install homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -46,9 +38,13 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property( properties = scrape_property(
location="San Diego, CA", location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28",
# foreclosure=True,
# mls_only=True, # only fetch MLS listings # mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
) )
print(f"Number of properties: {len(properties)}") print(f"Number of properties: {len(properties)}")
@@ -57,35 +53,6 @@ properties.to_csv(filename, index=False)
print(properties.head()) print(properties.head())
``` ```
### CLI
```
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
positional arguments:
location Location to scrape (e.g., San Francisco, CA)
options:
-l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending}
Listing type to scrape
-o {excel,csv}, --output {excel,csv}
Output format
-f FILENAME, --filename FILENAME
Name of the output file (without extension)
-p PROXY, --proxy PROXY
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
```
```bash
homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```
## Output ## Output
```plaintext ```plaintext
>>> properties.head() >>> properties.head()
@@ -115,11 +82,18 @@ Optional
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale). ├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│ Example: 30 (fetches properties listed/sold in the last 30 days) │ Example: 30 (fetches properties listed/sold in the last 30 days)
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
│ (use this to get properties in chunks as there's a 10k result limit)
│ Format for both must be "YYYY-MM-DD".
│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
├── foreclosure (True/False): If set, fetches only foreclosures
└── proxy (string): In format 'http://user:pass@host:port' └── proxy (string): In format 'http://user:pass@host:port'
``` ```
### Property Schema ### Property Schema
```plaintext ```plaintext
Property Property
@@ -167,22 +141,5 @@ Property
The following exceptions may be raised when using HomeHarvest: The following exceptions may be raised when using HomeHarvest:
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `NoResultsFound` - no properties found from your search - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD
## Frequently Asked Questions
---
**Q: Encountering issues with your searches?**
**A:** Try to broaden the parameters you're using. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues).
---
**Q: Received a Forbidden 403 response code?**
**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend:
- Waiting a few seconds between requests.
- Trying a VPN or using a proxy as a parameter to scrape_property() to change your IP address.
---

View File

@@ -1,10 +1,9 @@
import warnings import warnings
import pandas as pd import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType from .core.scrapers.models import ListingType
from .exceptions import InvalidListingType, NoResultsFound
def scrape_property( def scrape_property(
@@ -14,6 +13,9 @@ def scrape_property(
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int = None,
proxy: str = None, proxy: str = None,
date_from: str = None,
date_to: str = None,
foreclosure: bool = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -22,9 +24,11 @@ def scrape_property(
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping :param proxy: Proxy to use for scraping
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
@@ -33,6 +37,9 @@ def scrape_property(
radius=radius, radius=radius,
mls_only=mls_only, mls_only=mls_only,
last_x_days=past_days, last_x_days=past_days,
date_from=date_from,
date_to=date_to,
foreclosure=foreclosure,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
@@ -40,7 +47,7 @@ def scrape_property(
properties_dfs = [process_result(result) for result in results] properties_dfs = [process_result(result) for result in results]
if not properties_dfs: if not properties_dfs:
raise NoResultsFound("no results found for the query") return pd.DataFrame()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)

View File

@@ -11,6 +11,9 @@ class ScraperInput:
mls_only: bool | None = None mls_only: bool | None = None
proxy: str | None = None proxy: str | None = None
last_x_days: int | None = None last_x_days: int | None = None
date_from: str | None = None
date_to: str | None = None
foreclosure: bool | None = None
class Scraper: class Scraper:
@@ -36,6 +39,9 @@ class Scraper:
self.radius = scraper_input.radius self.radius = scraper_input.radius
self.last_x_days = scraper_input.last_x_days self.last_x_days = scraper_input.last_x_days
self.mls_only = scraper_input.mls_only self.mls_only = scraper_input.mls_only
self.date_from = scraper_input.date_from
self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure
def search(self) -> list[Property]: def search(self) -> list[Property]:
... ...

View File

@@ -34,6 +34,8 @@ class Address:
@dataclass @dataclass
class Description: class Description:
primary_photo: str | None = None
alt_photos: list[str] | None = None
style: str | None = None style: str | None = None
beds: int | None = None beds: int | None = None
baths_full: int | None = None baths_full: int | None = None

View File

@@ -9,7 +9,6 @@ from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .. import Scraper from .. import Scraper
from ....exceptions import NoResultsFound
from ..models import Property, Address, ListingType, Description from ..models import Property, Address, ListingType, Description
@@ -38,7 +37,7 @@ class RealtorScraper(Scraper):
result = response_json["autocomplete"] result = response_json["autocomplete"]
if not result: if not result:
raise NoResultsFound("No results found for location: " + self.location) return None
return result[0] return result[0]
@@ -50,6 +49,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -84,6 +84,12 @@ class RealtorScraper(Scraper):
garage garage
permalink permalink
} }
primary_photo {
href
}
photos {
href
}
} }
}""" }"""
@@ -152,6 +158,9 @@ class RealtorScraper(Scraper):
else None, else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
primary_photo=property_info["primary_photo"].get("href", "").replace("s.jpg",
"od-w480_h360_x2.webp?w=1080&q=75"),
alt_photos=self.process_alt_photos(property_info.get("photos", [])),
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -216,6 +225,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -246,6 +256,12 @@ class RealtorScraper(Scraper):
units units
year_built year_built
} }
primary_photo {
href
}
photos {
href
}
} }
}""" }"""
@@ -273,7 +289,7 @@ class RealtorScraper(Scraper):
] ]
def general_search( def general_search(
self, variables: dict, search_type: str self, variables: dict, search_type: str
) -> Dict[str, Union[int, list[Property]]]: ) -> Dict[str, Union[int, list[Property]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
@@ -316,6 +332,7 @@ class RealtorScraper(Scraper):
} }
location { location {
address { address {
street_direction
street_number street_number
street_name street_name
street_suffix street_suffix
@@ -332,19 +349,27 @@ class RealtorScraper(Scraper):
name name
} }
} }
primary_photo {
href
}
photos {
href
}
} }
} }
}""" }"""
date_param = ( date_param = ""
'sold_date: { min: "$today-%sD" }' % self.last_x_days if self.listing_type == ListingType.SOLD:
if self.listing_type == ListingType.SOLD and self.last_x_days if self.date_from and self.date_to:
else ( date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
'list_date: { min: "$today-%sD" }' % self.last_x_days elif self.last_x_days:
if self.last_x_days date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
else "" else:
) if self.date_from and self.date_to:
) date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'
sort_param = ( sort_param = (
"sort: [{ field: sold_date, direction: desc }]" "sort: [{ field: sold_date, direction: desc }]"
@@ -359,6 +384,12 @@ class RealtorScraper(Scraper):
) )
listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type
is_foreclosure = ""
if variables.get('foreclosure') is True:
is_foreclosure = "foreclosure: true"
elif variables.get('foreclosure') is False:
is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address if search_type == "comps": #: comps search, came from an address
query = """query Property_search( query = """query Property_search(
@@ -368,6 +399,7 @@ class RealtorScraper(Scraper):
) { ) {
home_search( home_search(
query: { query: {
%s
nearby: { nearby: {
coordinates: $coordinates coordinates: $coordinates
radius: $radius radius: $radius
@@ -380,6 +412,7 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s""" % ( ) %s""" % (
is_foreclosure,
listing_type.value.lower(), listing_type.value.lower(),
date_param, date_param,
pending_or_contingent_param, pending_or_contingent_param,
@@ -396,6 +429,7 @@ class RealtorScraper(Scraper):
) { ) {
home_search( home_search(
query: { query: {
%s
city: $city city: $city
county: $county county: $county
postal_code: $postal_code postal_code: $postal_code
@@ -408,6 +442,7 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s""" % ( ) %s""" % (
is_foreclosure,
listing_type.value.lower(), listing_type.value.lower(),
date_param, date_param,
pending_or_contingent_param, pending_or_contingent_param,
@@ -416,7 +451,7 @@ class RealtorScraper(Scraper):
) )
else: #: general search, came from an address else: #: general search, came from an address
query = ( query = (
"""query Property_search( """query Property_search(
$property_id: [ID]! $property_id: [ID]!
$offset: Int!, $offset: Int!,
) { ) {
@@ -427,7 +462,7 @@ class RealtorScraper(Scraper):
limit: 1 limit: 1
offset: $offset offset: $offset
) %s""" ) %s"""
% results_query % results_query
) )
payload = { payload = {
@@ -443,12 +478,12 @@ class RealtorScraper(Scraper):
properties: list[Property] = [] properties: list[Property] = []
if ( if (
response_json is None response_json is None
or "data" not in response_json or "data" not in response_json
or response_json["data"] is None or response_json["data"] is None
or search_key not in response_json["data"] or search_key not in response_json["data"]
or response_json["data"][search_key] is None or response_json["data"][search_key] is None
or "results" not in response_json["data"][search_key] or "results" not in response_json["data"][search_key]
): ):
return {"total": 0, "properties": []} return {"total": 0, "properties": []}
@@ -463,20 +498,23 @@ class RealtorScraper(Scraper):
continue continue
able_to_get_lat_long = ( able_to_get_lat_long = (
result result
and result.get("location") and result.get("location")
and result["location"].get("address") and result["location"].get("address")
and result["location"]["address"].get("coordinate") and result["location"]["address"].get("coordinate")
) )
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING:
continue
realty_property = Property( realty_property = Property(
mls=mls, mls=mls,
mls_id=result["source"].get("listing_id") mls_id=result["source"].get("listing_id")
if "source" in result and isinstance(result["source"], dict) if "source" in result and isinstance(result["source"], dict)
else None, else None,
property_url=f"{self.PROPERTY_URL}{result['property_id']}", property_url=f"{self.PROPERTY_URL}{result['property_id']}" if self.listing_type != ListingType.FOR_RENT else f"{self.PROPERTY_URL}M{result['property_id']}?listing_status=rental",
status="PENDING" if is_pending else result["status"].upper(), status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"], list_price=result["list_price"],
list_date=result["list_date"].split("T")[0] list_date=result["list_date"].split("T")[0]
@@ -506,6 +544,9 @@ class RealtorScraper(Scraper):
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
if not location_info:
return []
location_type = location_info["area_type"] location_type = location_info["area_type"]
search_variables = { search_variables = {
@@ -537,12 +578,20 @@ class RealtorScraper(Scraper):
return gql_results["properties"] return gql_results["properties"]
else: #: general search, comps (radius) else: #: general search, comps (radius)
if not location_info.get("centroid"):
return []
coordinates = list(location_info["centroid"].values()) coordinates = list(location_info["centroid"].values())
search_variables |= { search_variables |= {
"coordinates": coordinates, "coordinates": coordinates,
"radius": "{}mi".format(self.radius), "radius": "{}mi".format(self.radius),
} }
elif location_type == "postal_code":
search_variables |= {
"postal_code": location_info.get("postal_code"),
}
else: #: general search, location else: #: general search, location
search_variables |= { search_variables |= {
"city": location_info.get("city"), "city": location_info.get("city"),
@@ -551,6 +600,9 @@ class RealtorScraper(Scraper):
"postal_code": location_info.get("postal_code"), "postal_code": location_info.get("postal_code"),
} }
if self.foreclosure:
search_variables['foreclosure'] = self.foreclosure
result = self.general_search(search_variables, search_type=search_type) result = self.general_search(search_variables, search_type=search_type)
total = result["total"] total = result["total"]
homes = result["properties"] homes = result["properties"]
@@ -584,25 +636,34 @@ class RealtorScraper(Scraper):
return ", ".join(neighborhoods_list) if neighborhoods_list else None return ", ".join(neighborhoods_list) if neighborhoods_list else None
@staticmethod @staticmethod
def _parse_address(result: dict, search_type): def handle_none_safely(address_part):
if address_part is None:
return ""
return address_part
def _parse_address(self, result: dict, search_type):
if search_type == "general_search": if search_type == "general_search":
return Address( address = result['location']['address']
street=f"{result['location']['address']['street_number']} {result['location']['address']['street_name']} {result['location']['address']['street_suffix']}", else:
unit=result["location"]["address"]["unit"], address = result["address"]
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip=result["location"]["address"]["postal_code"],
)
return Address( return Address(
street=f"{result['address']['street_number']} {result['address']['street_name']} {result['address']['street_suffix']}", street=" ".join([
unit=result["address"]["unit"], self.handle_none_safely(address.get('street_number')),
city=result["address"]["city"], self.handle_none_safely(address.get('street_direction')),
state=result["address"]["state_code"], self.handle_none_safely(address.get('street_name')),
zip=result["address"]["postal_code"], self.handle_none_safely(address.get('street_suffix')),
]).strip(),
unit=address["unit"],
city=address["city"],
state=address["state_code"],
zip=address["postal_code"],
) )
@staticmethod @staticmethod
def _parse_description(result: dict) -> Description: def _parse_description(result: dict) -> Description:
description_data = result.get("description", {}) description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict): if description_data is None or not isinstance(description_data, dict):
@@ -612,7 +673,16 @@ class RealtorScraper(Scraper):
if style is not None: if style is not None:
style = style.upper() style = style.upper()
primary_photo = ""
if result and "primary_photo" in result:
primary_photo_info = result["primary_photo"]
if primary_photo_info and "href" in primary_photo_info:
primary_photo_href = primary_photo_info["href"]
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
return Description( return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=style, style=style,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
@@ -643,3 +713,16 @@ class RealtorScraper(Scraper):
days = (today - list_date).days days = (today - list_date).days
if days >= 0: if days >= 0:
return days return days
@staticmethod
def process_alt_photos(photos_info):
try:
alt_photos = []
if photos_info:
for photo_info in photos_info:
href = photo_info.get("href", "")
alt_photo_href = href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
alt_photos.append(alt_photo_href)
return alt_photos
except Exception:
pass

View File

@@ -1,6 +1,5 @@
class InvalidListingType(Exception): class InvalidListingType(Exception):
"""Raised when a provided listing type is does not exist.""" """Raised when a provided listing type is does not exist."""
class InvalidDate(Exception):
class NoResultsFound(Exception): """Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23 """
"""Raised when no results are found for the given location"""

View File

@@ -1,6 +1,7 @@
from .core.scrapers.models import Property, ListingType
import pandas as pd import pandas as pd
from .exceptions import InvalidListingType from datetime import datetime
from .core.scrapers.models import Property, ListingType
from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
"property_url", "property_url",
@@ -30,6 +31,8 @@ ordered_properties = [
"stories", "stories",
"hoa_fee", "hoa_fee",
"parking_garage", "parking_garage",
"primary_photo",
"alt_photos",
] ]
@@ -48,6 +51,8 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style prop_data["style"] = description.style
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
@@ -70,3 +75,18 @@ def validate_input(listing_type: str) -> None:
raise InvalidListingType( raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist." f"Provided listing type, '{listing_type}', does not exist."
) )
def validate_dates(date_from: str | None, date_to: str | None) -> None:
if (date_from is not None and date_to is None) or (date_from is None and date_to is not None):
raise InvalidDate("Both date_from and date_to must be provided.")
if date_from and date_to:
try:
date_from_obj = datetime.strptime(date_from, "%Y-%m-%d")
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.")
except ValueError as e:
raise InvalidDate(f"Invalid date format or range")

View File

@@ -1,8 +1,8 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.6" version = "0.3.13"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
readme = "README.md" readme = "README.md"
@@ -13,7 +13,6 @@ homeharvest = "homeharvest.cli:main"
python = ">=3.10,<3.13" python = ">=3.10,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"
openpyxl = "^3.1.2"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@@ -1,7 +1,6 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import ( from homeharvest.exceptions import (
InvalidListingType, InvalidListingType,
NoResultsFound,
) )
@@ -50,6 +49,16 @@ def test_realtor_pending_comps():
assert len(set([len(result) for result in results])) == len(results) assert len(set([len(result) for result in results])) == len(results)
def test_realtor_sold_past():
    """Scraping sold listings for the past 30 days returns at least one row."""
    sold_recently = scrape_property(
        location="San Diego, CA",
        past_days=30,
        listing_type="sold",
    )

    assert sold_recently is not None
    assert len(sold_recently) > 0
def test_realtor_comps(): def test_realtor_comps():
result = scrape_property( result = scrape_property(
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",
@@ -75,6 +84,20 @@ def test_realtor_last_x_days_sold():
) and len(days_result_30) != len(days_result_10) ) and len(days_result_30) != len(days_result_10)
def test_realtor_date_range_sold():
    """A wider sold-date range must yield more results than a narrower one."""
    narrow_range = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-05-01",
        date_to="2023-05-28",
    )
    wide_range = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-04-01",
        date_to="2023-06-10",
    )

    # Both queries must produce a result object before sizes are compared.
    assert narrow_range is not None
    assert wide_range is not None
    assert len(narrow_range) < len(wide_range)
def test_realtor_single_property(): def test_realtor_single_property():
results = [ results = [
scrape_property( scrape_property(
@@ -107,15 +130,33 @@ def test_realtor():
assert all([result is not None for result in results]) assert all([result is not None for result in results])
bad_results = []
try: def test_realtor_city():
bad_results += [ results = scrape_property(
scrape_property( location="Atlanta, GA",
location="abceefg ju098ot498hh9", listing_type="for_sale",
listing_type="for_sale", )
)
] assert results is not None and len(results) > 0
except (InvalidListingType, NoResultsFound):
def test_realtor_bad_address():
bad_results = scrape_property(
location="abceefg ju098ot498hh9",
listing_type="for_sale",
)
if len(bad_results) == 0:
assert True assert True
assert all([result is None for result in bad_results])
def test_realtor_foreclosed():
    """Toggling the foreclosure flag must change the number of results."""
    common_query = {
        "location": "Dallas, TX",
        "listing_type": "for_sale",
        "past_days": 100,
    }

    foreclosed = scrape_property(**common_query, foreclosure=True)
    not_foreclosed = scrape_property(**common_query, foreclosure=False)

    foreclosed_count = len(foreclosed)
    not_foreclosed_count = len(not_foreclosed)
    assert foreclosed_count != not_foreclosed_count