[fix] add back zillow/redfin

pull/30/head
Cullen Watson 2023-10-03 22:15:07 -05:00
parent bd33c3b5a4
commit ff95ca0611
5 changed files with 573 additions and 11 deletions

View File

@ -24,7 +24,12 @@ def _validate_input(site_name: str, status: str) -> None:
def _scrape_single_site( def _scrape_single_site(
location: str, site_name: str, status: str, proxy: str = None, timeframe: str = None location: str,
site_name: str,
status: str,
radius: float,
proxy: str = None,
timeframe: str = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Helper function to scrape a single site. Helper function to scrape a single site.
@ -36,6 +41,7 @@ def _scrape_single_site(
status=status, status=status,
site_name=SiteName.get_by_value(site_name.lower()), site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy, proxy=proxy,
radius=radius,
timeframe=timeframe, timeframe=timeframe,
) )
@ -53,7 +59,8 @@ def scrape_property(
location: str, location: str,
timeframe: str = None, timeframe: str = None,
site_name: Union[str, list[str]] = None, site_name: Union[str, list[str]] = None,
status: str = "sale", listing_type: str = "for_sale",
radius: float = None,
proxy: str = None, proxy: str = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
@ -65,6 +72,7 @@ def scrape_property(
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold') :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties :return: pd.DataFrame containing properties
""" """
status = listing_type
if site_name is None: if site_name is None:
site_name = list(_scrapers.keys()) site_name = list(_scrapers.keys())
@ -80,7 +88,13 @@ def scrape_property(
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = { futures = {
executor.submit( executor.submit(
_scrape_single_site, location, s_name, status, proxy, timeframe _scrape_single_site,
location,
s_name,
status,
radius,
proxy,
timeframe,
): s_name ): s_name
for s_name in site_name for s_name in site_name
} }

View File

@ -16,6 +16,7 @@ class ScraperInput:
site_name: str site_name: str
proxy: Optional[str] = None proxy: Optional[str] = None
timeframe: Optional[str] = None timeframe: Optional[str] = None
radius: float | None = None
def __post_init__(self): def __post_init__(self):
if self.status == "sold" and not self.timeframe: if self.status == "sold" and not self.timeframe:
@ -50,6 +51,7 @@ class Scraper:
self.listing_type = scraper_input.status self.listing_type = scraper_input.status
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name
self.radius = scraper_input.radius
def search(self) -> list[Property]: def search(self) -> list[Property]:
... ...

View File

@ -590,12 +590,15 @@ class RealtorScraper(Scraper):
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
is_for_comps = self.radius is not None and location_type == "address"
if location_type == "address": if location_type == "address" and not is_for_comps:
property_id = location_info["mpr_id"] property_id = location_info["mpr_id"]
return self.handle_address(property_id) return self.handle_address(property_id)
offset = 0 offset = 0
if not is_for_comps:
search_variables = { search_variables = {
"city": location_info.get("city"), "city": location_info.get("city"),
"county": location_info.get("county"), "county": location_info.get("county"),
@ -603,6 +606,13 @@ class RealtorScraper(Scraper):
"postal_code": location_info.get("postal_code"), "postal_code": location_info.get("postal_code"),
"offset": offset, "offset": offset,
} }
else:
coordinates = list(location_info["centroid"].values())
search_variables = {
"coordinates": coordinates,
"radius": "{}mi".format(self.radius),
"offset": offset,
}
result = self.handle_area(search_variables) result = self.handle_area(search_variables)
total = result["total"] total = result["total"]

View File

@ -0,0 +1,228 @@
"""
homeharvest.redfin.__init__
~~~~~~~~~~~~
This module implements the scraper for redfin.com
"""
import json
from typing import Any
from .. import Scraper
from ..models import Property, Address, Status
from ....exceptions import NoResultsFound, SearchTooBroad
from datetime import datetime
class RedfinScraper(Scraper):
    """Scraper implementation for redfin.com (for-sale, sold, and rental listings)."""

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        # The "status" field of the input drives which Redfin endpoint/flow we use.
        self.listing_type = scraper_input.status

    def _handle_location(self):
        """Resolve ``self.location`` to a ``(region_id, region_type)`` pair.

        Uses Redfin's location-autocomplete endpoint. ``region_type`` is either a
        numeric code understood by the search endpoints, the sentinel ``"address"``
        (single-property flow), or ``"state"`` (rejected later in ``search``).

        :raises NoResultsFound: when the autocomplete payload has no match
        """
        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
            self.location
        )

        response = self.session.get(url)
        # Redfin prefixes its JSON payloads with "{}&&" (anti-JSON-hijacking guard); strip it.
        response_json = json.loads(response.text.replace("{}&&", ""))

        def get_region_type(match_type: str):
            # Map autocomplete match types onto the region_type codes the search
            # endpoints expect. Unknown types implicitly return None.
            if match_type == "4":
                return "2"  #: zip
            elif match_type == "2":
                return "6"  #: city
            elif match_type == "1":
                return "address"  #: address, needs to be handled differently
            elif match_type == "11":
                return "state"

        if "exactMatch" not in response_json["payload"]:
            raise NoResultsFound(
                "No results found for location: {}".format(self.location)
            )

        if response_json["payload"]["exactMatch"] is not None:
            target = response_json["payload"]["exactMatch"]
        else:
            # Fall back to the first suggestion when there is no exact match.
            target = response_json["payload"]["sections"][0]["rows"][0]

        return target["id"].split("_")[1], get_region_type(target["type"])

    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
        """Convert one raw Redfin home dict into a ``Property``.

        :param home: raw home payload from the gis / aboveTheFold endpoints
        :param single_search: True when parsing an aboveTheFold (single-address)
            response, whose fields are shaped slightly differently
        """

        def get_value(key: str) -> Any | None:
            # Most numeric fields arrive wrapped as {"value": ...}; returns None
            # when the key is absent or carries no "value".
            if key in home and "value" in home[key]:
                return home[key]["value"]

        if not single_search:
            address = Address(
                street=get_value("streetLine"),
                city=home.get("city"),
                state=home.get("state"),
                zip=home.get("zip"),
            )
        else:
            address_info = home.get("streetAddress")

            address = Address(
                street=address_info.get("assembledAddress"),
                city=home.get("city"),
                state=home.get("state"),
                zip=home.get("zip"),
            )
        url = "https://www.redfin.com{}".format(home["url"])

        # lotSize may be a bare int or a {"value": ...} wrapper depending on endpoint.
        lot_size_data = home.get("lotSize")
        if not isinstance(lot_size_data, int):
            lot_size = (
                lot_size_data.get("value", None)
                if isinstance(lot_size_data, dict)
                else None
            )
        else:
            lot_size = lot_size_data

        lat_long = get_value("latLong")
        return Property(
            status=self.listing_type,
            address=address,
            property_url=url,
            beds=home["beds"] if "beds" in home else None,
            baths_full=home["baths"] if "baths" in home else None,
            list_price=get_value("price"),
            est_sf=get_value("sqFt"),
            stories=home["stories"] if "stories" in home else None,
            # aboveTheFold responses expose yearBuilt as a bare value, not wrapped.
            yr_blt=get_value("yearBuilt")
            if not single_search
            else home.get("yearBuilt"),
            lot_sf=lot_size,
            # pricePerSqFt is sometimes a bare int, sometimes a wrapped value.
            prc_sqft=get_value("pricePerSqFt")
            if type(home.get("pricePerSqFt")) != int
            else home.get("pricePerSqFt"),
            mls_id=get_value("mlsId"),
            latitude=lat_long.get("latitude") if lat_long else None,
            longitude=lat_long.get("longitude") if lat_long else None,
            # soldDate is an epoch timestamp in milliseconds.
            last_sold_date=datetime.fromtimestamp(home["soldDate"] / 1000)
            if "soldDate" in home
            else None,
        )

    def _handle_rentals(self, region_id, region_type):
        """Fetch and parse rental listings for a region.

        :raises NoResultsFound: when the region has no rentals
        """
        url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"

        response = self.session.get(url)
        response.raise_for_status()
        homes = response.json()

        properties_list = []

        for home in homes["homes"]:
            home_data = home["homeData"]
            rental_data = home["rentalExtension"]

            property_url = f"https://www.redfin.com{home_data.get('url', '')}"
            address_info = home_data.get("addressInfo", {})
            centroid = address_info.get("centroid", {}).get("centroid", {})
            address = Address(
                street=address_info.get("formattedStreetLine"),
                city=address_info.get("city"),
                state=address_info.get("state"),
                zip=address_info.get("zip"),
            )

            # Rentals report ranges; we surface the minimum of each range.
            price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
            bed_range = rental_data.get("bedRange", {"min": None, "max": None})
            bath_range = rental_data.get("bathRange", {"min": None, "max": None})
            sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})

            property_ = Property(
                property_url=property_url,
                status=Status.FOR_RENT.value,
                address=address,
                latitude=centroid.get("latitude"),
                longitude=centroid.get("longitude"),
                baths_full=bath_range.get("min"),
                beds=bed_range.get("min"),
                list_price=price_range.get("min"),
                est_sf=sqft_range.get("min"),
            )

            properties_list.append(property_)

        if not properties_list:
            raise NoResultsFound("No rentals found for the given location.")

        return properties_list

    def _parse_building(self, building: dict) -> Property:
        """Convert one raw Redfin building dict into a ``Property``."""
        street_address = " ".join(
            [
                building["address"]["streetNumber"],
                building["address"]["directionalPrefix"],
                building["address"]["streetName"],
                building["address"]["streetType"],
            ]
        )
        return Property(
            # Was self.status; no code on this scraper assigns .status —
            # __init__ and _parse_home use self.listing_type, so use it here too.
            status=self.listing_type,
            address=Address(
                street=street_address,
                city=building["address"]["city"],
                state=building["address"]["stateOrProvinceCode"],
                zip=building["address"]["postalCode"],
            ),
            property_url="https://www.redfin.com{}".format(building["url"]),
        )

    def handle_address(self, home_id: str):
        """Fetch a single property by its Redfin home id.

        EPs:
        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
        """
        url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
            home_id
        )

        response = self.session.get(url)
        response_json = json.loads(response.text.replace("{}&&", ""))

        parsed_home = self._parse_home(
            response_json["payload"]["addressSectionInfo"], single_search=True
        )
        return [parsed_home]

    def search(self):
        """Run the search and return a list of ``Property`` results.

        :raises SearchTooBroad: for state-level locations
        """
        region_id, region_type = self._handle_location()

        if region_type == "state":
            raise SearchTooBroad(
                "State searches are not supported, please use a more specific location."
            )

        if region_type == "address":
            # For exact addresses the autocomplete id *is* the home id.
            home_id = region_id
            return self.handle_address(home_id)

        if self.listing_type == Status.FOR_RENT:
            return self._handle_rentals(region_id, region_type)
        else:
            if self.listing_type == Status.FOR_SALE:
                url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
            else:
                # Sold searches are limited to the last 30 days.
                url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
            response = self.session.get(url)
            response_json = json.loads(response.text.replace("{}&&", ""))
            if "payload" in response_json:
                homes_list = response_json["payload"].get("homes", [])
                buildings_list = response_json["payload"].get("buildings", {}).values()

                homes = [self._parse_home(home) for home in homes_list] + [
                    self._parse_building(building) for building in buildings_list
                ]
                return homes
            else:
                return []

View File

@ -0,0 +1,308 @@
"""
homeharvest.zillow.__init__
~~~~~~~~~~~~
This module implements the scraper for zillow.com
"""
import re
import json
import tls_client
from .. import Scraper
from requests.exceptions import HTTPError
from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, Status
import urllib.parse
from datetime import datetime, timedelta
class ZillowScraper(Scraper):
    """Scraper implementation for zillow.com (for-sale, for-rent, and sold listings)."""

    def __init__(self, scraper_input):
        # Zillow blocks default TLS fingerprints; use a Chrome-like TLS client.
        session = tls_client.Session(
            client_identifier="chrome112", random_tls_extension_order=True
        )
        super().__init__(scraper_input, session)

        self.session.headers.update(
            {
                "authority": "www.zillow.com",
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "accept-language": "en-US,en;q=0.9",
                "cache-control": "max-age=0",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "same-origin",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
            }
        )

        if not self.is_plausible_location(self.location):
            raise NoResultsFound("Invalid location input: {}".format(self.location))

        listing_type_to_url_path = {
            Status.FOR_SALE: "for_sale",
            Status.FOR_RENT: "for_rent",
            Status.SOLD: "recently_sold",
        }

        self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/"

    def is_plausible_location(self, location: str) -> bool:
        """Return True when Zillow's autocomplete knows the location."""
        url = (
            "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
            "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
        ).format(urllib.parse.quote(location))

        resp = self.session.get(url)
        return resp.json()["results"] != []

    def search(self):
        """Run the search and return a list of ``Property`` results.

        :raises HTTPError: on non-200 responses
        :raises NoResultsFound: when the page has no embedded search data
        :raises GeoCoordsNotFound: when map bounds cannot be extracted
        """
        resp = self.session.get(self.url)
        if resp.status_code != 200:
            raise HTTPError(f"bad response status code: {resp.status_code}")
        content = resp.text

        # The search/property payload lives in the Next.js bootstrap script.
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            content,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        json_str = match.group(1)
        data = json.loads(json_str)

        if "searchPageState" in data["props"]["pageProps"]:
            # Area search: pull the map bounds out of the inline script, then
            # replay the search against the backend endpoint.
            pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'

            match = re.search(pattern, content)

            if match:
                coords = [float(coord) for coord in match.groups()]
                return self._fetch_properties_backend(coords)

            else:
                raise GeoCoordsNotFound("Box bounds could not be located.")

        elif "gdpClientCache" in data["props"]["pageProps"]:
            # Exact-address search: Zillow redirects to a single property page.
            gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
            main_key = list(gdp_client_cache.keys())[0]

            property_data = gdp_client_cache[main_key]["property"]
            property_ = self._get_single_property_page(property_data)

            return [property_]
        raise NoResultsFound("Specific property data not found in the response.")

    def _fetch_properties_backend(self, coords):
        """Query Zillow's search-page-state endpoint for the given map bounds.

        :param coords: [west, east, south, north] floats
        :raises HTTPError: on non-200 responses
        """
        url = "https://www.zillow.com/async-create-search-page-state"

        filter_state_for_sale = {
            "sortSelection": {
                # "value": "globalrelevanceex"
                "value": "days"
            },
            "isAllHomes": {"value": True},
        }

        filter_state_for_rent = {
            "isForRent": {"value": True},
            "isForSaleByAgent": {"value": False},
            "isForSaleByOwner": {"value": False},
            "isNewConstruction": {"value": False},
            "isComingSoon": {"value": False},
            "isAuction": {"value": False},
            "isForSaleForeclosure": {"value": False},
            "isAllHomes": {"value": True},
        }

        filter_state_sold = {
            "isRecentlySold": {"value": True},
            "isForSaleByAgent": {"value": False},
            "isForSaleByOwner": {"value": False},
            "isNewConstruction": {"value": False},
            "isComingSoon": {"value": False},
            "isAuction": {"value": False},
            "isForSaleForeclosure": {"value": False},
            "isAllHomes": {"value": True},
        }

        selected_filter = (
            filter_state_for_rent
            if self.listing_type == Status.FOR_RENT
            else filter_state_for_sale
            if self.listing_type == Status.FOR_SALE
            else filter_state_sold
        )

        payload = {
            "searchQueryState": {
                "pagination": {},
                "isMapVisible": True,
                "mapBounds": {
                    "west": coords[0],
                    "east": coords[1],
                    "south": coords[2],
                    "north": coords[3],
                },
                "filterState": selected_filter,
                "isListVisible": True,
                "mapZoom": 11,
            },
            "wants": {"cat1": ["mapResults"]},
            "isDebugRequest": False,
        }
        resp = self.session.put(url, json=payload)
        if resp.status_code != 200:
            raise HTTPError(f"bad response status code: {resp.status_code}")
        return self._parse_properties(resp.json())

    @staticmethod
    def parse_posted_time(time: str) -> datetime:
        """Convert a relative time string like '3 hours ago' to a datetime.

        Returns None for units other than hours/days (implicit fallthrough).
        """
        int_time = int(time.split(" ")[0])

        if "hour" in time:
            return datetime.now() - timedelta(hours=int_time)

        if "day" in time:
            return datetime.now() - timedelta(days=int_time)

    def _parse_properties(self, property_data: dict):
        """Convert backend mapResults into a list of ``Property`` objects."""
        mapresults = property_data["cat1"]["searchResults"]["mapResults"]

        properties_list = []

        for result in mapresults:
            if "hdpData" in result:
                home_info = result["hdpData"]["homeInfo"]
                address_data = {
                    # Was "streeet" (typo): Address(**address_data) requires the
                    # keyword to match Address's "street" field.
                    "street": home_info.get("streetAddress"),
                    "city": home_info.get("city"),
                    "state": home_info.get("state"),
                    "zip": home_info.get("zipcode"),
                }
                property_obj = Property(
                    address=Address(**address_data),
                    property_url=f"https://www.zillow.com{result['detailUrl']}",
                    style=home_info.get("homeType"),
                    # Was self.status; this class only ever sets self.listing_type.
                    status=home_info["statusType"].upper()
                    if "statusType" in home_info
                    else self.listing_type,
                    list_price=home_info.get("price"),
                    beds=int(home_info["bedrooms"])
                    if "bedrooms" in home_info
                    else None,
                    baths_full=home_info.get("bathrooms"),
                    est_sf=int(home_info["livingArea"])
                    if "livingArea" in home_info
                    else None,
                    # Guard against division by zero for zero-area listings.
                    prc_sqft=int(home_info["price"] // home_info["livingArea"])
                    if "livingArea" in home_info
                    and home_info["livingArea"] != 0
                    and "price" in home_info
                    else None,
                    latitude=result["latLong"]["latitude"],
                    longitude=result["latLong"]["longitude"],
                    lot_sf=round(home_info["lotAreaValue"], 2)
                    if "lotAreaValue" in home_info
                    else None,
                )

                properties_list.append(property_obj)

            elif "isBuilding" in result:
                # Apartment buildings report a price string like "$1,234+/mo".
                price_string = (
                    result["price"]
                    .replace("$", "")
                    .replace(",", "")
                    .replace("+/mo", "")
                )

                match = re.search(r"(\d+)", price_string)
                price_value = int(match.group(1)) if match else None
                building_obj = Property(
                    property_url=f"https://www.zillow.com{result['detailUrl']}",
                    style="BUILDING",
                    address=self._extract_address(result["address"]),
                    baths_full=result.get("minBaths"),
                    neighborhoods=result.get("communityName"),
                    list_price=price_value if "+/mo" in result.get("price") else None,
                    latitude=result.get("latLong", {}).get("latitude"),
                    longitude=result.get("latLong", {}).get("longitude"),
                )

                properties_list.append(building_obj)

        return properties_list

    def _get_single_property_page(self, property_data: dict):
        """
        This method is used when a user enters the exact location & zillow returns just one property
        """
        url = (
            f"https://www.zillow.com{property_data['hdpUrl']}"
            if "zillow.com" not in property_data["hdpUrl"]
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
        address = Address(
            street=address_data["streetAddress"],
            city=address_data["city"],
            state=address_data["state"],
            zip=address_data["zipcode"],
        )
        property_type = property_data.get("homeType", None)
        return Property(
            property_url=url,
            # Was self.status; this class only ever sets self.listing_type.
            status=self.listing_type,
            address=address,
            yr_blt=property_data.get("yearBuilt"),
            lot_sf=property_data.get("lotAreaValue"),
            stories=property_data.get("resoFacts", {}).get("stories"),
            mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
            beds=property_data.get("bedrooms"),
            baths_full=property_data.get("bathrooms"),
            list_price=property_data.get("price"),
            est_sf=property_data.get("livingArea"),
            prc_sqft=property_data.get("resoFacts", {}).get("pricePerSquareFoot"),
            latitude=property_data.get("latitude"),
            longitude=property_data.get("longitude"),
        )

    def _extract_address(self, address_str):
        """
        Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
        and return an Address object.
        """
        parts = address_str.split(", ")

        if len(parts) != 3:
            raise ValueError(f"Unexpected address format: {address_str}")

        address_one = parts[0].strip()
        city = parts[1].strip()
        state_zip = parts[2].split(" ")

        if len(state_zip) == 1:
            state = state_zip[0].strip()
            zip_code = None
        elif len(state_zip) == 2:
            state = state_zip[0].strip()
            zip_code = state_zip[1].strip()
        else:
            raise ValueError(f"Unexpected state/zip format in address: {address_str}")

        return Address(
            street=address_one,
            city=city,
            state=state,
            zip=zip_code,
        )