feat(scrapers): add zillow

This commit is contained in:
Cullen Watson
2023-09-17 15:06:31 -05:00
parent 2f3b012747
commit 2f5ea1ca88
11 changed files with 349 additions and 97 deletions

View File

@@ -1,6 +1,6 @@
from dataclasses import dataclass
import requests
from .types import Property, ListingType
from .models import Property, ListingType
@dataclass
@@ -11,9 +11,12 @@ class ScraperInput:
class Scraper:
listing_type = ListingType.FOR_SALE
def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.session = requests.Session()
Scraper.listing_type = scraper_input.listing_type
if scraper_input.proxy_url:
self.session.proxies = {
@@ -21,9 +24,12 @@ class Scraper:
"https": scraper_input.proxy_url,
}
def search(self) -> list[Property]: ...
def search(self) -> list[Property]:
...
@staticmethod
def parse_home(home) -> Property: ...
def _parse_home(home) -> Property:
...
def handle_location(self): ...
def handle_location(self):
...

View File

@@ -24,14 +24,29 @@ class Property:
url: str
beds: int | None = None
baths: int | None = None
baths: float | None = None
stories: int | None = None
agent_name: str | None = None
description: str | None = None
year_built: int | None = None
square_feet: int | None = None
price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None
mls_id: str | None = None
property_type: str | None = None
listing_type: ListingType | None = None
lot_size: int | None = None
description: str | None = None
@dataclass
class Building:
address: Address
url: str
num_units: int | None = None
min_unit_price: int | None = None
max_unit_price: int | None = None
avg_unit_price: int | None = None
listing_type: str | None = None

View File

@@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any
@@ -10,39 +10,42 @@ class RealtorScraper(Scraper):
def handle_location(self):
headers = {
'authority': 'parser-external.geo.moveaws.com',
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'origin': 'https://www.realtor.com',
'referer': 'https://www.realtor.com/',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
"authority": "parser-external.geo.moveaws.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"origin": "https://www.realtor.com",
"referer": "https://www.realtor.com/",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "cross-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}
params = {
'input': self.location,
'client_id': 'for-sale',
'limit': '1',
'area_types': 'city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park',
"input": self.location,
"client_id": "for-sale",
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
}
response = self.session.get('https://parser-external.geo.moveaws.com/suggest', params=params, headers=headers)
response = self.session.get(
"https://parser-external.geo.moveaws.com/suggest",
params=params,
headers=headers,
)
response_json = response.json()
return response_json['autocomplete'][0]
return response_json["autocomplete"][0]
def search(self):
location_info = self.handle_location()
location_type = location_info['area_type']
location_type = location_info["area_type"]
"""
property types:
apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes
"""
print('a')
print("a")

View File

@@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any
@@ -8,11 +8,13 @@ class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)
def handle_location(self):
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location)
def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
self.location
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
def get_region_type(match_type: str):
if match_type == "4":
@@ -22,51 +24,53 @@ class RedfinScraper(Scraper):
elif match_type == "1":
return "address" #: address, needs to be handled differently
if response_json['payload']['exactMatch'] is not None:
target = response_json['payload']['exactMatch']
if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"]
else:
target = response_json['payload']['sections'][0]['rows'][0]
target = response_json["payload"]["sections"][0]["rows"][0]
return target['id'].split('_')[1], get_region_type(target['type'])
return target["id"].split("_")[1], get_region_type(target["type"])
@staticmethod
def parse_home(home: dict, single_search: bool = False) -> Property:
def _parse_home(home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None:
if key in home and 'value' in home[key]:
return home[key]['value']
if key in home and "value" in home[key]:
return home[key]["value"]
if not single_search:
address = Address(
address_one=get_value('streetLine'),
city=home['city'],
state=home['state'],
zip_code=home['zip']
address_one=get_value("streetLine"),
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)
else:
address_info = home['streetAddress']
address_info = home["streetAddress"]
address = Address(
address_one=address_info['assembledAddress'],
city=home['city'],
state=home['state'],
zip_code=home['zip']
address_one=address_info["assembledAddress"],
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)
url = 'https://www.redfin.com{}'.format(home['url'])
url = "https://www.redfin.com{}".format(home["url"])
return Property(
address=address,
url=url,
beds=home['beds'] if 'beds' in home else None,
baths=home['baths'] if 'baths' in home else None,
stories=home['stories'] if 'stories' in home else None,
agent_name=get_value('listingAgent'),
description=home['listingRemarks'] if 'listingRemarks' in home else None,
year_built=get_value('yearBuilt') if not single_search else home['yearBuilt'],
square_feet=get_value('sqFt'),
price_per_square_foot=get_value('pricePerSqFt'),
price=get_value('price'),
mls_id=get_value('mlsId')
beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None,
agent_name=get_value("listingAgent"),
description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt")
if not single_search
else home["yearBuilt"],
square_feet=get_value("sqFt"),
price_per_square_foot=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"),
)
def handle_address(self, home_id: str):
@@ -78,25 +82,33 @@ class RedfinScraper(Scraper):
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
"""
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(home_id)
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
parsed_home = self.parse_home(response_json['payload']['addressSectionInfo'], single_search=True)
parsed_home = self._parse_home(
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home]
def search(self):
region_id, region_type = self.handle_location()
region_id, region_type = self._handle_location()
if region_type == "address":
home_id = region_id
return self.handle_address(home_id)
url = 'https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}'.format(region_id, region_type)
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
region_id, region_type
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
homes = [self.parse_home(home) for home in response_json['payload']['homes']] #: support buildings
homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]
] #: support buildings
return homes

View File

@@ -0,0 +1,205 @@
import re
import json
from ..models import Property, Address, Building, ListingType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper
class ZillowScraper(Scraper):
    """Scrape Zillow listings by parsing the embedded ``__NEXT_DATA__`` JSON blob.

    A generic location search yields a list of ``Property``/``Building`` results;
    an exact-address search yields a single ``Property``.
    """

    # Populated by Scraper.__init__ (which assigns Scraper.listing_type from the
    # ScraperInput). NOTE(review): the original line was
    # `listing_type: ListingType.FOR_SALE` — a malformed annotation using an enum
    # *member* as the type; it had no runtime effect. A bare annotation documents
    # the attribute without shadowing the inherited value.
    listing_type: ListingType

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        if self.listing_type == ListingType.FOR_SALE:
            self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
        elif self.listing_type == ListingType.FOR_RENT:
            self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
        else:
            # Fail fast: previously an unsupported type surfaced later as an
            # AttributeError on self.url inside search().
            raise ValueError(f"Unsupported listing type: {self.listing_type}")

    def search(self):
        """Fetch ``self.url`` and parse results out of the page's ``__NEXT_DATA__`` script.

        Returns:
            list: parsed ``Property``/``Building`` objects (one element for an
            exact-address match).

        Raises:
            NoResultsFound: the page contains no ``__NEXT_DATA__`` blob.
            PropertyNotFound: the blob has neither search results nor a
                single-property cache.
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            resp.text,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )
        data = json.loads(match.group(1))
        page_props = data["props"]["pageProps"]

        if "searchPageState" in page_props:
            # Generic location: a page of result cards.
            houses = page_props["searchPageState"]["cat1"]["searchResults"][
                "listResults"
            ]
            return [self._parse_home(house) for house in houses]
        if "gdpClientCache" in page_props:
            # Exact address: a one-entry cache holding the property detail payload.
            gdp_client_cache = json.loads(page_props["gdpClientCache"])
            main_key = next(iter(gdp_client_cache))
            property_data = gdp_client_cache[main_key]["property"]
            # Renamed local from `property`, which shadowed the builtin.
            parsed = self._get_single_property_page(property_data)
            return [parsed]
        raise PropertyNotFound("Specific property data not found in the response.")

    @classmethod
    def _parse_home(cls, home: dict):
        """Parse one search-result card (generic-location searches).

        Cards with ``hdpData.homeInfo`` are single homes (``Property``); cards
        without it are multi-unit buildings (``Building``).
        """
        url = (
            f"https://www.zillow.com{home['detailUrl']}"
            if "zillow.com" not in home["detailUrl"]
            else home["detailUrl"]
        )

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            home_info = home["hdpData"]["homeInfo"]
            return Property(
                address=cls._extract_address(home),
                agent_name=cls._extract_agent_name(home),
                url=url,
                beds=home_info["bedrooms"],
                baths=home_info["bathrooms"],
                listing_type=home_info.get("homeType"),
                **cls._extract_price(home),
            )

        # Multi-unit building card.
        keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
        address_one, city, state, zip_code = (home[key] for key in keys)
        address_one, address_two = cls._parse_address_two(address_one)
        address = Address(address_one, city, state, zip_code, address_two)
        return Building(address=address, url=url, **cls._extract_building_info(home))

    @classmethod
    def _get_single_property_page(cls, property_data: dict):
        """Parse the cached detail payload (exact-address searches)."""
        url = (
            f"https://www.zillow.com{property_data['hdpUrl']}"
            if "zillow.com" not in property_data["hdpUrl"]
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )
        # Pull nested dicts once instead of re-fetching per field.
        attribution = property_data.get("attributionInfo", {})
        reso_facts = property_data.get("resoFacts", {})
        return Property(
            address=address,
            url=url,
            beds=property_data.get("bedrooms"),
            baths=property_data.get("bathrooms"),
            year_built=property_data.get("yearBuilt"),
            price=property_data.get("price"),
            lot_size=property_data.get("lotSize"),
            agent_name=attribution.get("agentName"),
            stories=reso_facts.get("stories"),
            description=property_data.get("description"),
            mls_id=attribution.get("mlsId"),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot"),
            square_feet=property_data.get("livingArea"),
            listing_type=property_data.get("homeType"),
        )

    @classmethod
    def _extract_building_info(cls, home: dict) -> dict:
        """Summarize unit count and price range for a multi-unit building card.

        Unit prices look like "$1,234+" — strip "$", ",", and the "+" suffix.
        """
        # Compute the price list once (the original rebuilt this generator for
        # min() and max() separately).
        prices = [
            int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
            for unit in home["units"]
        ]
        return {
            "listing_type": cls.listing_type,
            "num_units": len(prices),
            # min()/max() on an empty "units" list raise ValueError, matching
            # the original behavior.
            "min_unit_price": min(prices),
            "max_unit_price": max(prices),
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Extract price/size fields from a search-result card's hdpData.

        Returns a dict suitable for ``Property(**...)`` expansion.
        """
        info = home["hdpData"]["homeInfo"]
        price = int(info["priceForHDP"])
        square_feet = info.get("livingArea")
        lot_size = info.get("lotAreaValue")
        # Explicit dict instead of the original locals()-filtering hack, which
        # silently depended on local variable names.
        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": lot_size,
            "price_per_square_foot": price // square_feet
            if square_feet and price
            else None,
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Return the agent name from "Listing by: <name>", or None."""
        broker_str = home.get("brokerName", "")
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str):
        """Split a unit designator ("APT ..." / "#...") off the street line.

        Returns (address_one, address_two) where address_two may be None.
        """
        apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
        address_two = apt_match.group().strip() if apt_match else None
        address_one = (
            address_one.replace(address_two, "").strip() if address_two else address_one
        )
        return address_one, address_two

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an Address from a search-result card's hdpData.homeInfo."""
        keys = ("streetAddress", "city", "state", "zipcode")
        address_one, city, state, zip_code = (
            home["hdpData"]["homeInfo"][key] for key in keys
        )
        address_one, address_two = ZillowScraper._parse_address_two(address_one)
        return Address(address_one, city, state, zip_code, address_two=address_two)

    @staticmethod
    def _get_headers():
        """Browser-like request headers for zillow.com."""
        return {
            # Fixed: was "parser-external.geo.moveaws.com", copy-pasted from the
            # realtor.com scraper's headers; every other header targets zillow.com.
            "authority": "www.zillow.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }