diff --git a/.gitignore b/.gitignore index 1f97e4a..41dd5d2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ **/__pycache__/ **/.pytest_cache/ *.pyc -/.ipynb_checkpoints/ \ No newline at end of file +/.ipynb_checkpoints/ +*.csv \ No newline at end of file diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 728a8c2..5d21217 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -38,6 +38,13 @@ def get_ordered_properties(result: Property) -> list[str]: "currency", "price", "apt_min_price", + "apt_max_price", + "apt_min_sqft", + "apt_max_sqft", + "apt_min_beds", + "apt_max_beds", + "apt_min_baths", + "apt_max_baths", "tax_assessed_value", "square_feet", "price_per_sqft", diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 8385405..f87eb43 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -102,4 +102,11 @@ class Property: bldg_min_area: int | None = None # apt + apt_min_beds: int | None = None + apt_max_beds: int | None = None + apt_min_baths: float | None = None + apt_max_baths: float | None = None apt_min_price: int | None = None + apt_max_price: int | None = None + apt_min_sqft: int | None = None + apt_max_sqft: int | None = None diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index 3e4fbde..d591fce 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -2,7 +2,7 @@ import json from typing import Any from .. 
import Scraper from ....utils import parse_address_two, parse_unit -from ..models import Property, Address, PropertyType +from ..models import Property, Address, PropertyType, ListingType, SiteName from ....exceptions import NoResultsFound @@ -108,6 +108,64 @@ class RedfinScraper(Scraper): else None, ) + def _handle_rentals(self, region_id, region_type): + url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}" + + response = self.session.get(url) + response.raise_for_status() + homes = response.json() + + properties_list = [] + + for home in homes["homes"]: + home_data = home["homeData"] + rental_data = home["rentalExtension"] + + property_url = f"https://www.redfin.com{home_data.get('url', '')}" + address_info = home_data.get("addressInfo", {}) + centroid = address_info.get("centroid", {}).get("centroid", {}) + address = Address( + street_address=address_info.get("formattedStreetLine", None), + city=address_info.get("city", None), + state=address_info.get("state", None), + zip_code=address_info.get("zip", None), + unit=None, + country="US" if address_info.get("countryCode", None) == 1 else None, + ) + + price_range = rental_data.get("rentPriceRange", {"min": None, "max": None}) + bed_range = rental_data.get("bedRange", {"min": None, "max": None}) + bath_range = rental_data.get("bathRange", {"min": None, "max": None}) + sqft_range = rental_data.get("sqftRange", {"min": None, "max": None}) + + property_ = Property( + property_url=property_url, + site_name=SiteName.REDFIN, + listing_type=ListingType.FOR_RENT, + address=address, + apt_min_beds=bed_range.get("min", None), + apt_min_baths=bath_range.get("min", None), + apt_max_beds=bed_range.get("max", None), + apt_max_baths=bath_range.get("max", None), + description=rental_data.get("description", None), + latitude=centroid.get("latitude", None), + longitude=centroid.get("longitude", None), + apt_min_price=price_range.get("min", None), 
apt_max_price=price_range.get("max", None), + apt_min_sqft=sqft_range.get("min", None), + apt_max_sqft=sqft_range.get("max", None), + img_src=home_data.get("staticMapUrl", None), + posted_time=rental_data.get("lastUpdated", None), + bldg_name=rental_data.get("propertyName", None), + ) + + properties_list.append(property_) + + if not properties_list: + raise NoResultsFound("No rentals found for the given location.") + + return properties_list + def _parse_building(self, building: dict) -> Property: street_address = " ".join( [ @@ -168,18 +226,19 @@ class RedfinScraper(Scraper): home_id = region_id return self.handle_address(home_id) - url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format( - region_id, region_type - ) - - response = self.session.get(url) - response_json = json.loads(response.text.replace("{}&&", "")) - - homes = [ - self._parse_home(home) for home in response_json["payload"]["homes"] - ] + [ - self._parse_building(building) - for building in response_json["payload"]["buildings"].values() - ] - - return homes + if self.listing_type == ListingType.FOR_RENT: + return self._handle_rentals(region_id, region_type) + else: + if self.listing_type == ListingType.FOR_SALE: + url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}" + else: + url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30" + response = self.session.get(url) + response_json = json.loads(response.text.replace("{}&&", "")) + homes = [ + self._parse_home(home) for home in response_json["payload"]["homes"] + ] + [ + self._parse_building(building) + for building in response_json["payload"]["buildings"].values() + ] + return homes diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 646872d..3eb108c 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ 
-22,14 +22,14 @@ class ZillowScraper(Scraper): self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/" def is_plausible_location(self, location: str) -> bool: - url = ('https://www.zillowstatic.com/autocomplete/v3/suggestions?q={' - '}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render').format( - location - ) + url = ( + "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={" + "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render" + ).format(location) response = self.session.get(url) - return response.json()['results'] != [] + return response.json()["results"] != [] def search(self): resp = self.session.get(self.url, headers=self._get_headers())