HomeHarvest/homeharvest/core/scrapers/redfin/__init__.py

234 lines
9.8 KiB
Python
Raw Normal View History

2023-09-19 19:13:20 -07:00
"""
homeharvest.core.scrapers.redfin.__init__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module implements the scraper for redfin.com.
"""
2023-09-15 15:42:47 -07:00
import json
2023-09-15 16:03:17 -07:00
from typing import Any
from .. import Scraper
2023-09-19 19:13:20 -07:00
from ....utils import parse_address_two, parse_address_one
2023-09-19 09:58:20 -07:00
from ..models import Property, Address, PropertyType, ListingType, SiteName
from ....exceptions import NoResultsFound
2023-09-15 15:17:37 -07:00
class RedfinScraper(Scraper):
    """Scraper that queries redfin.com's "stingray" endpoints.

    The flow implemented by :meth:`search` is:

    1. resolve ``self.location`` through the location-autocomplete endpoint
       (:meth:`_handle_location`);
    2. depending on the kind of match and on ``self.listing_type``, fetch a
       single home (:meth:`handle_address`), rentals (:meth:`_handle_rentals`)
       or the for-sale / recently-sold listings of the region.

    Every path returns a list of ``Property`` objects.
    """

    #: Maps the autocomplete ``type`` field to the ``region_type`` query value
    #: used by the search endpoints. Unknown types map to ``None``.
    _REGION_TYPES = {
        "4": "2",  #: zip
        "2": "6",  #: city
        "1": "address",  #: address, needs to be handled differently
    }

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        self.listing_type = scraper_input.listing_type

    @staticmethod
    def _load_json(response):
        """Parse a response body after dropping the ``{}&&`` marker.

        Only the first occurrence of the marker is removed so that the same
        character sequence appearing later (e.g. inside a JSON string value)
        is left intact.
        """
        return json.loads(response.text.replace("{}&&", "", 1))

    def _handle_location(self):
        """Resolve ``self.location`` to a ``(region_id, region_type)`` pair.

        :return: the part of the match ``id`` after the first underscore, and
            the mapped region type (``"2"``, ``"6"``, ``"address"`` or
            ``None`` for an unmapped match type).
        :raises NoResultsFound: if the autocomplete response contains no
            usable match.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote

        # The location is free text; escape it so characters such as "&" or
        # "#" cannot break out of the ``location`` query parameter.
        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
            quote(str(self.location), safe="")
        )

        response = self.session.get(url)
        payload = self._load_json(response).get("payload") or {}

        if "exactMatch" not in payload:
            raise NoResultsFound("No results found for location: {}".format(self.location))

        target = payload["exactMatch"]
        if target is None:
            # No exact match: fall back to the first suggested row, if any.
            sections = payload.get("sections") or []
            rows = sections[0].get("rows") if sections else None
            if not rows:
                raise NoResultsFound("No results found for location: {}".format(self.location))
            target = rows[0]

        return target["id"].split("_")[1], self._REGION_TYPES.get(target["type"])

    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
        """Convert one home record into a ``Property``.

        :param home: a home dict from the region search, or the
            ``addressSectionInfo`` dict when ``single_search`` is true.
        :param single_search: selects the single-home layout, where the street
            line is read from ``streetAddress.assembledAddress`` and
            ``yearBuilt`` is read directly instead of through a ``value``
            wrapper.
        :raises KeyError: if ``home`` has no ``url`` entry.
        """

        def get_value(key: str) -> Any | None:
            # Fields are expected as ``{"value": ...}`` wrappers; anything
            # else (missing, null, scalar) yields None instead of raising.
            entry = home.get(key)
            return entry.get("value") if isinstance(entry, dict) else None

        def number_or_value(key: str) -> Any | None:
            # Some numeric fields arrive either bare or wrapped in a dict.
            raw = home.get(key)
            if isinstance(raw, (int, float)):
                return raw
            return get_value(key)

        if single_search:
            street_line = (home.get("streetAddress") or {}).get("assembledAddress")
        else:
            street_line = get_value("streetLine")

        # NOTE(review): parse_address_one is assumed to accept None, as the
        # original code could already pass it a missing street line - confirm.
        address_one, address_two = parse_address_one(street_line)
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=home.get("city"),
            state=home.get("state"),
            zip_code=home.get("zip"),
        )

        url = "https://www.redfin.com{}".format(home["url"])
        lat_long = home.get("latLong") or {}
        beds = home.get("beds")
        baths = home.get("baths")
        price = get_value("price")
        sqft = get_value("sqFt")

        return Property(
            site_name=self.site_name,
            listing_type=self.listing_type,
            address=address,
            property_url=url,
            # A single listing has one value, so min and max coincide.
            beds_min=beds,
            beds_max=beds,
            baths_min=baths,
            baths_max=baths,
            price_min=price,
            price_max=price,
            sqft_min=sqft,
            sqft_max=sqft,
            stories=home.get("stories"),
            agent_name=get_value("listingAgent"),
            description=home.get("listingRemarks"),
            year_built=home.get("yearBuilt") if single_search else get_value("yearBuilt"),
            lot_area_value=number_or_value("lotSize"),
            property_type=PropertyType.from_int_code(home.get("propertyType")),
            price_per_sqft=number_or_value("pricePerSqFt"),
            mls_id=get_value("mlsId"),
            latitude=lat_long.get("latitude"),
            longitude=lat_long.get("longitude"),
        )

    def _handle_rentals(self, region_id, region_type):
        """Fetch and parse the rental listings of a region.

        :raises NoResultsFound: if the response contains no homes.
        :raises KeyError: if a home entry lacks ``homeData`` or
            ``rentalExtension``.
        """
        url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"

        response = self.session.get(url)
        response.raise_for_status()
        homes = response.json()

        empty_range = {"min": None, "max": None}
        properties_list = []

        for home in homes.get("homes") or []:
            home_data = home["homeData"]
            rental_data = home["rentalExtension"]

            property_url = f"https://www.redfin.com{home_data.get('url', '')}"
            # ``or`` also covers keys that are present but explicitly null.
            address_info = home_data.get("addressInfo") or {}
            centroid = (address_info.get("centroid") or {}).get("centroid") or {}

            # Keep the unit part too, consistent with the other parsers.
            address_one, address_two = parse_address_one(address_info.get("formattedStreetLine"))
            address = Address(
                address_one=address_one,
                address_two=address_two,
                city=address_info.get("city"),
                state=address_info.get("state"),
                zip_code=address_info.get("zip"),
            )

            price_range = rental_data.get("rentPriceRange") or empty_range
            bed_range = rental_data.get("bedRange") or empty_range
            bath_range = rental_data.get("bathRange") or empty_range
            sqft_range = rental_data.get("sqftRange") or empty_range

            property_ = Property(
                property_url=property_url,
                site_name=SiteName.REDFIN,
                listing_type=ListingType.FOR_RENT,
                address=address,
                description=rental_data.get("description"),
                latitude=centroid.get("latitude"),
                longitude=centroid.get("longitude"),
                baths_min=bath_range.get("min"),
                baths_max=bath_range.get("max"),
                beds_min=bed_range.get("min"),
                beds_max=bed_range.get("max"),
                price_min=price_range.get("min"),
                price_max=price_range.get("max"),
                sqft_min=sqft_range.get("min"),
                sqft_max=sqft_range.get("max"),
                img_src=home_data.get("staticMapUrl"),
                posted_time=rental_data.get("lastUpdated"),
                bldg_name=rental_data.get("propertyName"),
            )
            properties_list.append(property_)

        if not properties_list:
            raise NoResultsFound("No rentals found for the given location.")

        return properties_list

    def _parse_building(self, building: dict) -> Property:
        """Convert one ``buildings`` entry of a region search into a ``Property``.

        Address components that are missing, null or empty are skipped when
        the street and unit strings are assembled.

        :raises KeyError: if ``building`` has no ``address`` or ``url`` entry.
        """
        address_data = building["address"]

        def join_parts(*keys: str) -> str:
            # Skip absent/empty components so a building without, say, a
            # directional prefix neither raises nor yields double spaces.
            return " ".join(str(address_data[key]) for key in keys if address_data.get(key))

        street_address = join_parts("streetNumber", "directionalPrefix", "streetName", "streetType")
        unit = join_parts("unitType", "unitValue")

        return Property(
            site_name=self.site_name,
            property_type=PropertyType("BUILDING"),
            address=Address(
                address_one=parse_address_one(street_address)[0],
                city=address_data.get("city"),
                state=address_data.get("stateOrProvinceCode"),
                zip_code=address_data.get("postalCode"),
                address_two=parse_address_two(unit),
            ),
            property_url="https://www.redfin.com{}".format(building["url"]),
            listing_type=self.listing_type,
            unit_count=building.get("numUnitsForSale"),
        )

    def handle_address(self, home_id: str):
        """Fetch a single home by its Redfin property id.

        EPs:
        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3

        Only the ``aboveTheFold`` endpoint is queried here.

        :return: a one-element list holding the parsed ``Property``.
        """
        url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
            home_id
        )

        response = self.session.get(url)
        response_json = self._load_json(response)

        parsed_home = self._parse_home(response_json["payload"]["addressSectionInfo"], single_search=True)
        return [parsed_home]

    def search(self):
        """Run the search for ``self.location`` and return a list of ``Property``.

        An address match returns that single home. Otherwise rentals are
        fetched for ``ListingType.FOR_RENT``; for-sale listings for
        ``ListingType.FOR_SALE``; and for any other listing type the same
        endpoint is queried with ``sold_within_days=30``. An empty list is
        returned when the response carries no payload.
        """
        region_id, region_type = self._handle_location()

        if region_type == "address":
            # For an address match the "region id" is the home id.
            return self.handle_address(region_id)

        if self.listing_type == ListingType.FOR_RENT:
            return self._handle_rentals(region_id, region_type)

        url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}"
        if self.listing_type != ListingType.FOR_SALE:
            url += "&sold_within_days=30"
        url += "&num_homes=100000"

        response = self.session.get(url)
        payload = self._load_json(response).get("payload")
        if not payload:
            return []

        homes_list = payload.get("homes") or []
        buildings_list = (payload.get("buildings") or {}).values()

        homes = [self._parse_home(home) for home in homes_list]
        homes.extend(self._parse_building(building) for building in buildings_list)
        return homes