From ff95ca06112b03c788962ad28c10e4d673680cfe Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Tue, 3 Oct 2023 22:15:07 -0500 Subject: [PATCH] [fix] add back zillow/redfin --- homeharvest/__init__.py | 20 +- homeharvest/core/scrapers/__init__.py | 2 + homeharvest/core/scrapers/realtor/__init__.py | 26 +- homeharvest/core/scrapers/redfin/__init__.py | 228 +++++++++++++ homeharvest/core/scrapers/zillow/__init__.py | 308 ++++++++++++++++++ 5 files changed, 573 insertions(+), 11 deletions(-) create mode 100644 homeharvest/core/scrapers/redfin/__init__.py create mode 100644 homeharvest/core/scrapers/zillow/__init__.py diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index ff813a7..b0c3779 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -24,7 +24,12 @@ def _validate_input(site_name: str, status: str) -> None: def _scrape_single_site( - location: str, site_name: str, status: str, proxy: str = None, timeframe: str = None + location: str, + site_name: str, + status: str, + radius: float, + proxy: str = None, + timeframe: str = None, ) -> pd.DataFrame: """ Helper function to scrape a single site. @@ -36,6 +41,7 @@ def _scrape_single_site( status=status, site_name=SiteName.get_by_value(site_name.lower()), proxy=proxy, + radius=radius, timeframe=timeframe, ) @@ -53,7 +59,8 @@ def scrape_property( location: str, timeframe: str = None, site_name: Union[str, list[str]] = None, - status: str = "sale", + listing_type: str = "for_sale", + radius: float = None, proxy: str = None, ) -> pd.DataFrame: """ @@ -65,6 +72,7 @@ def scrape_property( :param listing_type: Listing type (e.g. 
'for_sale', 'for_rent', 'sold') :return: pd.DataFrame containing properties """ + status = listing_type if site_name is None: site_name = list(_scrapers.keys()) @@ -80,7 +88,13 @@ def scrape_property( with ThreadPoolExecutor() as executor: futures = { executor.submit( - _scrape_single_site, location, s_name, status, proxy, timeframe + _scrape_single_site, + location, + s_name, + status, + radius, + proxy, + timeframe, ): s_name for s_name in site_name } diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 1eb9fd3..735428a 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -16,6 +16,7 @@ class ScraperInput: site_name: str proxy: Optional[str] = None timeframe: Optional[str] = None + radius: float | None = None def __post_init__(self): if self.status == "sold" and not self.timeframe: @@ -50,6 +51,7 @@ class Scraper: self.listing_type = scraper_input.status self.site_name = scraper_input.site_name + self.radius = scraper_input.radius def search(self) -> list[Property]: ... 
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index b93fa13..23eed58 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -590,19 +590,29 @@ class RealtorScraper(Scraper): def search(self): location_info = self.handle_location() location_type = location_info["area_type"] + is_for_comps = self.radius is not None and location_type == "address" - if location_type == "address": + if location_type == "address" and not is_for_comps: property_id = location_info["mpr_id"] return self.handle_address(property_id) offset = 0 - search_variables = { - "city": location_info.get("city"), - "county": location_info.get("county"), - "state_code": location_info.get("state_code"), - "postal_code": location_info.get("postal_code"), - "offset": offset, - } + + if not is_for_comps: + search_variables = { + "city": location_info.get("city"), + "county": location_info.get("county"), + "state_code": location_info.get("state_code"), + "postal_code": location_info.get("postal_code"), + "offset": offset, + } + else: + coordinates = list(location_info["centroid"].values()) + search_variables = { + "coordinates": coordinates, + "radius": "{}mi".format(self.radius), + "offset": offset, + } result = self.handle_area(search_variables) total = result["total"] diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py new file mode 100644 index 0000000..1fbcd38 --- /dev/null +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -0,0 +1,228 @@ +""" +homeharvest.redfin.__init__ +~~~~~~~~~~~~ + +This module implements the scraper for redfin.com +""" +import json +from typing import Any +from .. 
import Scraper
+from ..models import Property, Address, Status
+from ....exceptions import NoResultsFound, SearchTooBroad
+from datetime import datetime
+
+
+class RedfinScraper(Scraper):
+    """Scraper for redfin.com backed by Redfin's "stingray" JSON endpoints."""
+
+    def __init__(self, scraper_input):
+        super().__init__(scraper_input)
+        self.listing_type = scraper_input.status
+
+    def _handle_location(self):
+        """Resolve self.location to a (region id, region type) pair via autocomplete."""
+        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
+            self.location
+        )
+
+        response = self.session.get(url)
+        # The body carries a "{}&&" marker that must be dropped before JSON parsing.
+        response_json = json.loads(response.text.replace("{}&&", ""))
+
+        def get_region_type(match_type: str):
+            if match_type == "4":
+                return "2"  #: zip
+            elif match_type == "2":
+                return "6"  #: city
+            elif match_type == "1":
+                return "address"  #: address, needs to be handled differently
+            elif match_type == "11":
+                return "state"
+
+        if "exactMatch" not in response_json["payload"]:
+            raise NoResultsFound(f"No results found for location: {self.location}")
+
+        if response_json["payload"]["exactMatch"] is not None:
+            target = response_json["payload"]["exactMatch"]
+        else:
+            target = response_json["payload"]["sections"][0]["rows"][0]
+
+        return target["id"].split("_")[1], get_region_type(target["type"])
+
+    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
+        """Build a Property from a search-result home, or a details payload if single_search."""
+        def get_value(key: str) -> Any | None:
+            if key in home and "value" in home[key]:
+                return home[key]["value"]
+
+        if not single_search:
+            address = Address(
+                street=get_value("streetLine"),
+                city=home.get("city"),
+                state=home.get("state"),
+                zip=home.get("zip"),
+            )
+        else:
+            address_info = home.get("streetAddress") or {}
+
+            address = Address(
+                street=address_info.get("assembledAddress"),
+                city=home.get("city"),
+                state=home.get("state"),
+                zip=home.get("zip"),
+            )
+
+        url = "https://www.redfin.com{}".format(home["url"])
+        lot_size_data = home.get("lotSize")
+
+        # lotSize is accepted either as a bare int or wrapped as {"value": ...}.
+        if isinstance(lot_size_data, int):
+            lot_size = lot_size_data
+        elif isinstance(lot_size_data, dict):
+            lot_size = lot_size_data.get("value")
+        else:
+            lot_size = None
+
+        lat_long = get_value("latLong")
+
+        return Property(
+            status=self.listing_type,
+            address=address,
+            property_url=url,
+            beds=home["beds"] if "beds" in home else None,
+            baths_full=home["baths"] if "baths" in home else None,
+            list_price=get_value("price"),
+            est_sf=get_value("sqFt"),
+            stories=home["stories"] if "stories" in home else None,
+            yr_blt=get_value("yearBuilt")
+            if not single_search
+            else home.get("yearBuilt"),
+            lot_sf=lot_size,
+            prc_sqft=get_value("pricePerSqFt")
+            if not isinstance(home.get("pricePerSqFt"), int)
+            else home.get("pricePerSqFt"),
+            mls_id=get_value("mlsId"),
+            latitude=lat_long.get("latitude") if lat_long else None,
+            longitude=lat_long.get("longitude") if lat_long else None,
+            last_sold_date=datetime.fromtimestamp(home["soldDate"] / 1000)
+            if "soldDate" in home
+            else None,
+        )
+
+    def _handle_rentals(self, region_id, region_type):
+        """Fetch rental listings for a region; raise NoResultsFound when there are none."""
+        url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
+        response = self.session.get(url)
+        response.raise_for_status()
+        homes = response.json()
+
+        properties_list = []
+
+        for home in homes["homes"]:
+            home_data = home["homeData"]
+            rental_data = home["rentalExtension"]
+
+            property_url = f"https://www.redfin.com{home_data.get('url', '')}"
+            address_info = home_data.get("addressInfo", {})
+            centroid = address_info.get("centroid", {}).get("centroid", {})
+            address = Address(
+                street=address_info.get("formattedStreetLine"),
+                city=address_info.get("city"),
+                state=address_info.get("state"),
+                zip=address_info.get("zip"),
+            )
+
+            price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
+            bed_range = rental_data.get("bedRange", {"min": None, "max": None})
+            bath_range = rental_data.get("bathRange", {"min": None, "max": None})
+            sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})
+
+            property_ = Property(
+                property_url=property_url,
+                status=Status.FOR_RENT.value,
+                address=address,
+                latitude=centroid.get("latitude"),
+                longitude=centroid.get("longitude"),
+                baths_full=bath_range.get("min"),
+                beds=bed_range.get("min"),
+                list_price=price_range.get("min"),
+                est_sf=sqft_range.get("min"),
+            )
+
+            properties_list.append(property_)
+
+        if not properties_list:
+            raise NoResultsFound("No rentals found for the given location.")
+
+        return properties_list
+
+    def _parse_building(self, building: dict) -> Property:
+        """Build a Property from a building entry using only its address and URL."""
+        addr = building["address"]
+        street_address = " ".join(
+            addr[key]
+            for key in ("streetNumber", "directionalPrefix", "streetName", "streetType")
+        )
+        return Property(
+            status=self.listing_type,
+            address=Address(
+                street=street_address,
+                city=addr["city"],
+                state=addr["stateOrProvinceCode"],
+                zip=addr["postalCode"],
+            ),
+            property_url="https://www.redfin.com{}".format(building["url"]),
+        )
+
+    def handle_address(self, home_id: str):
+        """
+        Fetch one property by its Redfin property id and return it as a single-item list.
+
+        Known detail endpoints (only aboveTheFold is requested here):
+        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
+        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
+        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
+        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
+        """
+        url = f"https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={home_id}&accessLevel=3"
+
+        response = self.session.get(url)
+        response_json = json.loads(response.text.replace("{}&&", ""))
+
+        parsed_home = self._parse_home(
+            response_json["payload"]["addressSectionInfo"], single_search=True
+        )
+        return [parsed_home]
+
+    def search(self):
+        """Run the search for self.location and return the matching Property objects."""
+        region_id, region_type = self._handle_location()
+
+        if region_type == "state":
+            raise SearchTooBroad(
+                "State searches are not supported, please use a more specific location."
+            )
+
+        if region_type == "address":
+            return self.handle_address(region_id)
+
+        if self.listing_type == Status.FOR_RENT:
+            return self._handle_rentals(region_id, region_type)
+        else:
+            if self.listing_type == Status.FOR_SALE:
+                url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
+            else:
+                url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
+            response = self.session.get(url)
+            response_json = json.loads(response.text.replace("{}&&", ""))
+
+            if "payload" in response_json:
+                homes_list = response_json["payload"].get("homes", [])
+                buildings_list = response_json["payload"].get("buildings", {}).values()
+
+                homes = [self._parse_home(home) for home in homes_list] + [
+                    self._parse_building(building) for building in buildings_list
+                ]
+                return homes
+            else:
+                return []
diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py
new file mode 100644
index 0000000..dc58759
--- /dev/null
+++ b/homeharvest/core/scrapers/zillow/__init__.py
@@ -0,0 +1,308 @@
+"""
+homeharvest.zillow.__init__
+~~~~~~~~~~~~
+
+This module implements the scraper for zillow.com
+"""
+import re
+import json
+
+import tls_client
+
+from ..
import Scraper
+from requests.exceptions import HTTPError
+from ....exceptions import GeoCoordsNotFound, NoResultsFound
+from ..models import Property, Address, Status
+import urllib.parse
+from datetime import datetime, timedelta
+
+
+class ZillowScraper(Scraper):
+    """Scraper for zillow.com search-result and single-property pages."""
+
+    def __init__(self, scraper_input):
+        """Open a Chrome-like TLS session, validate the location and build the search URL."""
+        session = tls_client.Session(
+            client_identifier="chrome112", random_tls_extension_order=True
+        )
+        super().__init__(scraper_input, session)
+
+        self.session.headers.update(
+            {
+                "authority": "www.zillow.com",
+                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+                "accept-language": "en-US,en;q=0.9",
+                "cache-control": "max-age=0",
+                "sec-fetch-dest": "document",
+                "sec-fetch-mode": "navigate",
+                "sec-fetch-site": "same-origin",
+                "sec-fetch-user": "?1",
+                "upgrade-insecure-requests": "1",
+                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
+            }
+        )
+
+        if not self.is_plausible_location(self.location):
+            raise NoResultsFound("Invalid location input: {}".format(self.location))
+
+        listing_type_to_url_path = {
+            Status.FOR_SALE: "for_sale",
+            Status.FOR_RENT: "for_rent",
+            Status.SOLD: "recently_sold",
+        }
+
+        self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/"
+
+    def is_plausible_location(self, location: str) -> bool:
+        """Return True when the autocomplete endpoint has at least one result for location."""
+        url = (
+            "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
+            "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
+        ).format(urllib.parse.quote(location))
+        resp = self.session.get(url)
+        return resp.json()["results"] != []
+
+    def search(self):
+        """Fetch the results page and return Property objects for an area or a single home."""
+        resp = self.session.get(self.url)
+        if resp.status_code != 200:
+            raise HTTPError(f"bad response status code: {resp.status_code}")
+        content = resp.text
+
+        # Page state is embedded as JSON in the __NEXT_DATA__ script tag; capture its body.
+        match = re.search(
+            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+            content,
+            re.DOTALL,
+        )
+        if not match:
+            raise NoResultsFound(
+                "No results were found for Zillow with the given Location."
+            )
+
+        json_str = match.group(1)
+        data = json.loads(json_str)
+
+        if "searchPageState" in data["props"]["pageProps"]:
+            pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
+
+            match = re.search(pattern, content)
+
+            if match:
+                coords = [float(coord) for coord in match.groups()]
+                return self._fetch_properties_backend(coords)
+            else:
+                raise GeoCoordsNotFound("Box bounds could not be located.")
+
+        elif "gdpClientCache" in data["props"]["pageProps"]:
+            gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
+            main_key = list(gdp_client_cache.keys())[0]
+
+            property_data = gdp_client_cache[main_key]["property"]
+            return [self._get_single_property_page(property_data)]
+        raise NoResultsFound("Specific property data not found in the response.")
+
+    def _fetch_properties_backend(self, coords):
+        """PUT a map-bounds search to Zillow's search-page-state endpoint and parse the results."""
+        url = "https://www.zillow.com/async-create-search-page-state"
+
+        filter_state_for_sale = {
+            "sortSelection": {"value": "days"},  # alternative: "globalrelevanceex"
+            "isAllHomes": {"value": True},
+        }
+
+        filter_state_for_rent = {
+            "isForRent": {"value": True},
+            "isForSaleByAgent": {"value": False},
+            "isForSaleByOwner": {"value": False},
+            "isNewConstruction": {"value": False},
+            "isComingSoon": {"value": False},
+            "isAuction": {"value": False},
+            "isForSaleForeclosure": {"value": False},
+            "isAllHomes": {"value": True},
+        }
+
+        filter_state_sold = {
+            "isRecentlySold": {"value": True},
+            "isForSaleByAgent": {"value": False},
+            "isForSaleByOwner": {"value": False},
+            "isNewConstruction": {"value": False},
+            "isComingSoon": {"value": False},
+            "isAuction": {"value": False},
+            "isForSaleForeclosure": {"value": False},
+            "isAllHomes": {"value": True},
+        }
+
+        # Any listing type other than rent or sale falls back to the sold filter.
+        selected_filter = (
+            filter_state_for_rent
+            if self.listing_type == Status.FOR_RENT
+            else filter_state_for_sale
+            if self.listing_type == Status.FOR_SALE
+            else filter_state_sold
+        )
+
+        # coords is indexed in the order west, east, south, north.
+        payload = {
+            "searchQueryState": {
+                "pagination": {},
+                "isMapVisible": True,
+                "mapBounds": {
+                    "west": coords[0],
+                    "east": coords[1],
+                    "south": coords[2],
+                    "north": coords[3],
+                },
+                "filterState": selected_filter,
+                "isListVisible": True,
+                "mapZoom": 11,
+            },
+            "wants": {"cat1": ["mapResults"]},
+            "isDebugRequest": False,
+        }
+        resp = self.session.put(url, json=payload)
+        if resp.status_code != 200:
+            raise HTTPError(f"bad response status code: {resp.status_code}")
+        return self._parse_properties(resp.json())
+
+    @staticmethod
+    def parse_posted_time(time: str) -> datetime | None:
+        """Turn text like "3 hours ..." or "2 days ..." into a datetime that far before now."""
+        int_time = int(time.split(" ")[0])
+
+        if "hour" in time:
+            return datetime.now() - timedelta(hours=int_time)
+        if "day" in time:
+            return datetime.now() - timedelta(days=int_time)
+
+    def _parse_properties(self, property_data: dict):
+        """Convert the mapResults entries (homes and buildings) into Property objects."""
+        mapresults = property_data["cat1"]["searchResults"]["mapResults"]
+        properties_list = []
+
+        for result in mapresults:
+            if "hdpData" in result:
+                home_info = result["hdpData"]["homeInfo"]
+                address_data = {
+                    "street": home_info.get("streetAddress"),
+                    "city": home_info.get("city"),
+                    "state": home_info.get("state"),
+                    "zip": home_info.get("zipcode"),
+                }
+                property_obj = Property(
+                    address=Address(**address_data),
+                    property_url=f"https://www.zillow.com{result['detailUrl']}",
+                    style=home_info.get("homeType"),
+                    status=home_info["statusType"].upper()
+                    if "statusType" in home_info
+                    else self.listing_type,
+                    list_price=home_info.get("price"),
+                    beds=int(home_info["bedrooms"])
+                    if "bedrooms" in home_info
+                    else None,
+                    baths_full=home_info.get("bathrooms"),
+                    est_sf=int(home_info["livingArea"])
+                    if "livingArea" in home_info
+                    else None,
+                    prc_sqft=int(home_info["price"] // home_info["livingArea"])
+                    if "livingArea" in home_info
+                    and home_info["livingArea"] != 0
+                    and "price" in home_info
+                    else None,
+                    latitude=result["latLong"]["latitude"],
+                    longitude=result["latLong"]["longitude"],
+                    lot_sf=round(home_info["lotAreaValue"], 2)
+                    if "lotAreaValue" in home_info
+                    else None,
+                )
+
+                properties_list.append(property_obj)
+
+            elif "isBuilding" in result:
+                price_string = (
+                    result["price"]
+                    .replace("$", "")
+                    .replace(",", "")
+                    .replace("+/mo", "")
+                )
+
+                match = re.search(r"(\d+)", price_string)
+                price_value = int(match.group(1)) if match else None
+                building_obj = Property(
+                    property_url=f"https://www.zillow.com{result['detailUrl']}",
+                    style="BUILDING",
+                    address=self._extract_address(result["address"]),
+                    baths_full=result.get("minBaths"),
+                    neighborhoods=result.get("communityName"),
+                    list_price=price_value if "+/mo" in result.get("price") else None,
+                    latitude=result.get("latLong", {}).get("latitude"),
+                    longitude=result.get("latLong", {}).get("longitude"),
+                )
+
+                properties_list.append(building_obj)
+
+        return properties_list
+
+    def _get_single_property_page(self, property_data: dict):
+        """
+        Used when the user enters an exact location and Zillow returns just one property.
+        """
+        url = (
+            f"https://www.zillow.com{property_data['hdpUrl']}"
+            if "zillow.com" not in property_data["hdpUrl"]
+            else property_data["hdpUrl"]
+        )
+        address_data = property_data["address"]
+        address = Address(
+            street=address_data["streetAddress"],
+            city=address_data["city"],
+            state=address_data["state"],
+            zip=address_data["zipcode"],
+        )
+        return Property(
+            property_url=url,
+            status=self.listing_type,
+            style=property_data.get("homeType"),
+            address=address,
+            yr_blt=property_data.get("yearBuilt"),
+            lot_sf=property_data.get("lotAreaValue"),
+            stories=property_data.get("resoFacts", {}).get("stories"),
+            mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
+            beds=property_data.get("bedrooms"),
+            baths_full=property_data.get("bathrooms"),
+            list_price=property_data.get("price"),
+            est_sf=property_data.get("livingArea"),
+            prc_sqft=property_data.get("resoFacts", {}).get("pricePerSquareFoot"),
+            latitude=property_data.get("latitude"),
+            longitude=property_data.get("longitude"),
+        )
+
+    def _extract_address(self, address_str):
+        """
+        Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
+        and return an Address object.
+        """
+        parts = address_str.split(", ")
+
+        if len(parts) != 3:
+            raise ValueError(f"Unexpected address format: {address_str}")
+
+        address_one = parts[0].strip()
+        city = parts[1].strip()
+        state_zip = parts[2].split(" ")
+
+        if len(state_zip) == 1:
+            state = state_zip[0].strip()
+            zip_code = None
+        elif len(state_zip) == 2:
+            state = state_zip[0].strip()
+            zip_code = state_zip[1].strip()
+        else:
+            raise ValueError(f"Unexpected state/zip format in address: {address_str}")
+
+        return Address(
+            street=address_one,
+            city=city,
+            state=state,
+            zip=zip_code,
+        )