HomeHarvest/homeharvest/core/scrapers/redfin/__init__.py

115 lines
4.1 KiB
Python
Raw Normal View History

2023-09-15 15:42:47 -07:00
import json
2023-09-17 13:06:31 -07:00
from ..models import Property, Address
2023-09-15 15:17:37 -07:00
from .. import Scraper
2023-09-15 16:03:17 -07:00
from typing import Any
2023-09-15 15:17:37 -07:00
class RedfinScraper(Scraper):
    """Scraper that queries Redfin's ``stingray`` JSON endpoints.

    The class reads ``self.location`` and ``self.session``; neither is set
    here, so they are presumably provided by ``Scraper.__init__`` -- confirm
    against the base class.
    """

    # Literal text removed from each response body before it is decoded as
    # JSON. Only the first occurrence is removed so that the same character
    # sequence inside a JSON string value is left intact.
    _STINGRAY_PREFIX = "{}&&"

    # Autocomplete result ``type`` -> value used as ``region_type`` by
    # ``search``. Types not listed here map to ``None``.
    _REGION_TYPES = {
        "4": "2",  #: zip
        "2": "6",  #: city
        "1": "address",  #: address, needs to be handled differently
    }

    def __init__(self, scraper_input):
        super().__init__(scraper_input)

    @classmethod
    def _load_stingray_json(cls, text: str) -> Any:
        """Decode a response body after dropping the leading ``{}&&`` marker.

        Args:
            text: Raw response body.

        Returns:
            The decoded JSON document.

        Raises:
            json.JSONDecodeError: If the remaining text is not valid JSON.
        """
        return json.loads(text.replace(cls._STINGRAY_PREFIX, "", 1))

    def _handle_location(self):
        """Resolve ``self.location`` through the location-autocomplete endpoint.

        The payload's ``exactMatch`` entry is used when it is not ``None``;
        otherwise the first row of the first section is used.

        Returns:
            A ``(identifier, region_type)`` tuple. ``identifier`` is the part
            of the match ``id`` after the first underscore. ``region_type``
            is ``"2"`` (zip), ``"6"`` (city), ``"address"``, or ``None`` for
            any other match type.

        Raises:
            KeyError, IndexError: If the response lacks the expected
                structure (for example, no autocomplete rows at all).
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import quote

        # Percent-encode the location so characters such as "&", "#" or "+"
        # cannot alter the query string.
        encoded_location = quote(str(self.location))
        url = (
            "https://www.redfin.com/stingray/do/location-autocomplete"
            f"?v=2&al=1&location={encoded_location}"
        )

        response = self.session.get(url)
        payload = self._load_stingray_json(response.text)["payload"]

        if payload["exactMatch"] is not None:
            target = payload["exactMatch"]
        else:
            target = payload["sections"][0]["rows"][0]

        return target["id"].split("_")[1], self._REGION_TYPES.get(target["type"])

    @staticmethod
    def _parse_home(home: dict, single_search: bool = False) -> Property:
        """Build a ``Property`` from one home dictionary.

        Args:
            home: Home data. With ``single_search=False`` it is an entry of
                the search payload's ``homes`` list; with
                ``single_search=True`` it is the ``addressSectionInfo``
                object of a home-details payload.
            single_search: Selects which of the two layouts to read. They
                differ in where the street address and the year built live.

        Returns:
            The populated ``Property``. Optional fields missing from
            ``home`` are set to ``None``.

        Raises:
            KeyError: If ``city``, ``state``, ``zip`` or ``url`` is missing,
                or (single search only) ``streetAddress`` /
                ``assembledAddress`` is missing.
        """

        def get_value(key: str) -> Any | None:
            # Fields come wrapped as {"value": ...}; anything else (absent
            # key, null, bare scalar) yields None instead of raising.
            wrapped = home.get(key)
            if isinstance(wrapped, dict):
                return wrapped.get("value")
            return None

        if single_search:
            street = home["streetAddress"]["assembledAddress"]
        else:
            street = get_value("streetLine")

        address = Address(
            address_one=street,
            city=home["city"],
            state=home["state"],
            zip_code=home["zip"],
        )
        url = f"https://www.redfin.com{home['url']}"

        # The single-search layout stores yearBuilt as a bare value, the
        # list layout as a {"value": ...} wrapper.
        if single_search:
            year_built = home.get("yearBuilt")
        else:
            year_built = get_value("yearBuilt")

        return Property(
            address=address,
            url=url,
            beds=home.get("beds"),
            baths=home.get("baths"),
            stories=home.get("stories"),
            agent_name=get_value("listingAgent"),
            description=home.get("listingRemarks"),
            year_built=year_built,
            square_feet=get_value("sqFt"),
            price_per_square_foot=get_value("pricePerSqFt"),
            price=get_value("price"),
            mls_id=get_value("mlsId"),
        )

    def handle_address(self, home_id: str):
        """Fetch one home by property id and return it as a one-item list.

        Only the ``aboveTheFold`` endpoint is requested. Related endpoints
        noted by the original author:

        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3

        Args:
            home_id: Property id placed in the ``propertyId`` query parameter.

        Returns:
            A list containing the single parsed ``Property``.

        Raises:
            KeyError: If the response lacks ``payload`` or
                ``addressSectionInfo``.
        """
        url = (
            "https://www.redfin.com/stingray/api/home/details/aboveTheFold"
            f"?propertyId={home_id}&accessLevel=3"
        )
        response = self.session.get(url)
        payload = self._load_stingray_json(response.text)["payload"]

        parsed_home = self._parse_home(
            payload["addressSectionInfo"], single_search=True
        )
        return [parsed_home]

    def search(self):
        """Run the search for ``self.location``.

        An address match is delegated to ``handle_address``; any other match
        is looked up through the ``gis`` endpoint by region id and type.

        Returns:
            A list of ``Property`` objects.

        Raises:
            KeyError: If the ``gis`` response lacks ``payload`` or ``homes``.
        """
        region_id, region_type = self._handle_location()

        if region_type == "address":
            # For address matches the identifier is a home id, not a region.
            return self.handle_address(region_id)

        url = (
            "https://www.redfin.com/stingray/api/gis"
            f"?al=1&region_id={region_id}&region_type={region_type}"
        )
        response = self.session.get(url)
        listings = self._load_stingray_json(response.text)["payload"]["homes"]

        # NOTE(review): the original carried the remark "support buildings"
        # here -- presumably a TODO; confirm with the author.
        return [self._parse_home(home) for home in listings]