feat(scrapers): add zillow

pull/1/head
Cullen Watson 2023-09-17 15:06:31 -05:00
parent 2f3b012747
commit 2f5ea1ca88
11 changed files with 349 additions and 97 deletions

5
.gitignore vendored
View File

@ -1,2 +1,5 @@
/.idea /.idea
dist **/dist/
**/__pycache__/
**/.pytest_cache/
*.pyc

View File

@ -1,13 +1,16 @@
from .core.scrapers.redfin import RedfinScraper from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.types import ListingType, Property from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
from typing import Union
_scrapers = { _scrapers = {
"redfin": RedfinScraper, "redfin": RedfinScraper,
"realtor.com": RealtorScraper "realtor.com": RealtorScraper,
"zillow": ZillowScraper,
} }
@ -15,12 +18,14 @@ def scrape_property(
location: str, location: str,
site_name: str, site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> list[Property]: #: eventually, return pandas dataframe ) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
if site_name.lower() not in _scrapers: if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.") raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__: if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.") raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,

View File

@ -1,6 +1,6 @@
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from .types import Property, ListingType from .models import Property, ListingType
@dataclass @dataclass
@ -11,9 +11,12 @@ class ScraperInput:
class Scraper: class Scraper:
listing_type = ListingType.FOR_SALE
def __init__(self, scraper_input: ScraperInput): def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location self.location = scraper_input.location
self.session = requests.Session() self.session = requests.Session()
Scraper.listing_type = scraper_input.listing_type
if scraper_input.proxy_url: if scraper_input.proxy_url:
self.session.proxies = { self.session.proxies = {
@ -21,9 +24,12 @@ class Scraper:
"https": scraper_input.proxy_url, "https": scraper_input.proxy_url,
} }
def search(self) -> list[Property]: ... def search(self) -> list[Property]:
...
@staticmethod @staticmethod
def parse_home(home) -> Property: ... def _parse_home(home) -> Property:
...
def handle_location(self): ... def handle_location(self):
...

View File

@ -24,14 +24,29 @@ class Property:
url: str url: str
beds: int | None = None beds: int | None = None
baths: int | None = None baths: float | None = None
stories: int | None = None stories: int | None = None
agent_name: str | None = None agent_name: str | None = None
description: str | None = None
year_built: int | None = None year_built: int | None = None
square_feet: int | None = None square_feet: int | None = None
price_per_square_foot: int | None = None price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None price: int | None = None
mls_id: str | None = None mls_id: str | None = None
property_type: str | None = None listing_type: ListingType | None = None
lot_size: int | None = None
description: str | None = None
@dataclass
class Building:
    """Aggregate record for a multi-unit listing.

    A ``Building`` carries per-building unit statistics instead of the
    per-home details (beds, baths, ...) held by ``Property``.  Every field
    other than ``address`` and ``url`` is optional and defaults to ``None``.
    """

    # Postal address of the building.
    address: Address
    # Link to the building's listing page.
    url: str
    # Number of units advertised for the building.
    num_units: int | None = None
    # Lowest, highest and mean advertised unit price.
    min_unit_price: int | None = None
    max_unit_price: int | None = None
    avg_unit_price: int | None = None
    # Declared as ListingType (not str) so it agrees with
    # ``Property.listing_type`` and with the enum value the Zillow scraper
    # actually stores here.
    listing_type: ListingType | None = None

View File

@ -1,5 +1,5 @@
import json import json
from ..types import Property, Address from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any from typing import Any
@ -10,39 +10,42 @@ class RealtorScraper(Scraper):
def handle_location(self): def handle_location(self):
headers = { headers = {
'authority': 'parser-external.geo.moveaws.com', "authority": "parser-external.geo.moveaws.com",
'accept': '*/*', "accept": "*/*",
'accept-language': 'en-US,en;q=0.9', "accept-language": "en-US,en;q=0.9",
'origin': 'https://www.realtor.com', "origin": "https://www.realtor.com",
'referer': 'https://www.realtor.com/', "referer": "https://www.realtor.com/",
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile': '?0', "sec-ch-ua-mobile": "?0",
'sec-ch-ua-platform': '"Windows"', "sec-ch-ua-platform": '"Windows"',
'sec-fetch-dest': 'empty', "sec-fetch-dest": "empty",
'sec-fetch-mode': 'cors', "sec-fetch-mode": "cors",
'sec-fetch-site': 'cross-site', "sec-fetch-site": "cross-site",
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
} }
params = { params = {
'input': self.location, "input": self.location,
'client_id': 'for-sale', "client_id": "for-sale",
'limit': '1', "limit": "1",
'area_types': 'city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park', "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
response = self.session.get('https://parser-external.geo.moveaws.com/suggest', params=params, headers=headers) response = self.session.get(
"https://parser-external.geo.moveaws.com/suggest",
params=params,
headers=headers,
)
response_json = response.json() response_json = response.json()
return response_json['autocomplete'][0] return response_json["autocomplete"][0]
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info['area_type'] location_type = location_info["area_type"]
""" """
property types: property types:
apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes
""" """
print('a') print("a")

View File

@ -1,5 +1,5 @@
import json import json
from ..types import Property, Address from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any from typing import Any
@ -8,11 +8,13 @@ class RedfinScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
def handle_location(self): def _handle_location(self):
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location) url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
self.location
)
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', '')) response_json = json.loads(response.text.replace("{}&&", ""))
def get_region_type(match_type: str): def get_region_type(match_type: str):
if match_type == "4": if match_type == "4":
@ -22,51 +24,53 @@ class RedfinScraper(Scraper):
elif match_type == "1": elif match_type == "1":
return "address" #: address, needs to be handled differently return "address" #: address, needs to be handled differently
if response_json['payload']['exactMatch'] is not None: if response_json["payload"]["exactMatch"] is not None:
target = response_json['payload']['exactMatch'] target = response_json["payload"]["exactMatch"]
else: else:
target = response_json['payload']['sections'][0]['rows'][0] target = response_json["payload"]["sections"][0]["rows"][0]
return target['id'].split('_')[1], get_region_type(target['type']) return target["id"].split("_")[1], get_region_type(target["type"])
@staticmethod @staticmethod
def parse_home(home: dict, single_search: bool = False) -> Property: def _parse_home(home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None: def get_value(key: str) -> Any | None:
if key in home and 'value' in home[key]: if key in home and "value" in home[key]:
return home[key]['value'] return home[key]["value"]
if not single_search: if not single_search:
address = Address( address = Address(
address_one=get_value('streetLine'), address_one=get_value("streetLine"),
city=home['city'], city=home["city"],
state=home['state'], state=home["state"],
zip_code=home['zip'] zip_code=home["zip"],
) )
else: else:
address_info = home['streetAddress'] address_info = home["streetAddress"]
address = Address( address = Address(
address_one=address_info['assembledAddress'], address_one=address_info["assembledAddress"],
city=home['city'], city=home["city"],
state=home['state'], state=home["state"],
zip_code=home['zip'] zip_code=home["zip"],
) )
url = 'https://www.redfin.com{}'.format(home['url']) url = "https://www.redfin.com{}".format(home["url"])
return Property( return Property(
address=address, address=address,
url=url, url=url,
beds=home['beds'] if 'beds' in home else None, beds=home["beds"] if "beds" in home else None,
baths=home['baths'] if 'baths' in home else None, baths=home["baths"] if "baths" in home else None,
stories=home['stories'] if 'stories' in home else None, stories=home["stories"] if "stories" in home else None,
agent_name=get_value('listingAgent'), agent_name=get_value("listingAgent"),
description=home['listingRemarks'] if 'listingRemarks' in home else None, description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value('yearBuilt') if not single_search else home['yearBuilt'], year_built=get_value("yearBuilt")
square_feet=get_value('sqFt'), if not single_search
price_per_square_foot=get_value('pricePerSqFt'), else home["yearBuilt"],
price=get_value('price'), square_feet=get_value("sqFt"),
mls_id=get_value('mlsId') price_per_square_foot=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"),
) )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
@ -78,25 +82,33 @@ class RedfinScraper(Scraper):
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3 https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
""" """
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(home_id) url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id
)
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', '')) response_json = json.loads(response.text.replace("{}&&", ""))
parsed_home = self.parse_home(response_json['payload']['addressSectionInfo'], single_search=True) parsed_home = self._parse_home(
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home] return [parsed_home]
def search(self): def search(self):
region_id, region_type = self.handle_location() region_id, region_type = self._handle_location()
if region_type == "address": if region_type == "address":
home_id = region_id home_id = region_id
return self.handle_address(home_id) return self.handle_address(home_id)
url = 'https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}'.format(region_id, region_type) url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
region_id, region_type
)
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', '')) response_json = json.loads(response.text.replace("{}&&", ""))
homes = [self.parse_home(home) for home in response_json['payload']['homes']] #: support buildings homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]
] #: support buildings
return homes return homes

View File

@ -0,0 +1,205 @@
import re
import json
from ..models import Property, Address, Building, ListingType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper
class ZillowScraper(Scraper):
    """Scraper that reads listings from the JSON embedded in a Zillow page.

    ``search`` downloads the search URL built in ``__init__``, extracts the
    ``__NEXT_DATA__`` script payload and converts it into ``Property`` and
    ``Building`` records.
    """

    # Type declaration only, deliberately without a value: the effective
    # listing type is the attribute assigned by ``Scraper.__init__``.  Giving
    # this subclass its own class-level value would shadow that assignment.
    listing_type: ListingType

    def __init__(self, scraper_input):
        """Build the Zillow search URL for the requested location.

        Raises:
            ValueError: if the listing type is neither FOR_SALE nor FOR_RENT.
                Failing here replaces the ``AttributeError`` on ``self.url``
                that would otherwise only surface inside ``search``.
        """
        super().__init__(scraper_input)

        if self.listing_type == ListingType.FOR_SALE:
            self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
        elif self.listing_type == ListingType.FOR_RENT:
            self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
        else:
            raise ValueError(
                f"Zillow scraper does not support listing type {self.listing_type!r}."
            )

    def search(self):
        """Fetch the search page and return the parsed listings.

        Returns:
            A list of ``Property`` / ``Building`` objects when the page holds
            search results, or a one-element list when the page describes a
            single property.

        Raises:
            requests.HTTPError: if the page request returns an error status.
            NoResultsFound: if the page has no ``__NEXT_DATA__`` payload.
            PropertyNotFound: if the payload holds neither search results nor
                data for a single property.
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            resp.text,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        page_props = json.loads(match.group(1))["props"]["pageProps"]

        if "searchPageState" in page_props:
            houses = page_props["searchPageState"]["cat1"]["searchResults"][
                "listResults"
            ]
            return [self._parse_home(house) for house in houses]

        if "gdpClientCache" in page_props:
            # The cache is itself a JSON-encoded string; only its first entry
            # is read.
            gdp_client_cache = json.loads(page_props["gdpClientCache"])
            first_entry = next(iter(gdp_client_cache.values()), None)
            if not first_entry or "property" not in first_entry:
                raise PropertyNotFound(
                    "Specific property data not found in the response."
                )
            return [self._get_single_property_page(first_entry["property"])]

        raise PropertyNotFound("Specific property data not found in the response.")

    @classmethod
    def _parse_home(cls, home: dict):
        """Convert one search-result entry into a ``Property`` or ``Building``.

        This method is used when a user enters a generic location & zillow
        returns more than one property.  Entries carrying
        ``hdpData.homeInfo`` become a ``Property``; all others are treated as
        multi-unit buildings.
        """
        url = cls._absolute_url(home["detailUrl"])

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            home_info = home["hdpData"]["homeInfo"]
            return Property(
                address=cls._extract_address(home),
                agent_name=cls._extract_agent_name(home),
                url=url,
                # .get(): a result without a bedroom/bathroom count should
                # not abort the whole search.
                beds=home_info.get("bedrooms"),
                baths=home_info.get("bathrooms"),
                # NOTE(review): this stores Zillow's ``homeType`` in a field
                # named ``listing_type`` - confirm that this is intended.
                listing_type=home_info.get("homeType"),
                **cls._extract_price(home),
            )

        keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
        address_one, city, state, zip_code = (home[key] for key in keys)
        address_one, address_two = cls._parse_address_two(address_one)
        address = Address(address_one, city, state, zip_code, address_two)

        return Building(address=address, url=url, **cls._extract_building_info(home))

    @classmethod
    def _get_single_property_page(cls, property_data: dict):
        """Convert the data of a single-property page into a ``Property``.

        This method is used when a user enters the exact location & zillow
        returns just one property.
        """
        url = cls._absolute_url(property_data["hdpUrl"])

        address_data = property_data["address"]
        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )

        # ``or {}`` also covers keys that are present but hold None.
        attribution = property_data.get("attributionInfo") or {}
        reso_facts = property_data.get("resoFacts") or {}

        return Property(
            address=address,
            url=url,
            beds=property_data.get("bedrooms"),
            baths=property_data.get("bathrooms"),
            year_built=property_data.get("yearBuilt"),
            price=property_data.get("price"),
            lot_size=property_data.get("lotSize"),
            agent_name=attribution.get("agentName"),
            stories=reso_facts.get("stories"),
            description=property_data.get("description"),
            mls_id=attribution.get("mlsId"),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot"),
            square_feet=property_data.get("livingArea"),
            listing_type=property_data.get("homeType"),
        )

    @classmethod
    def _extract_building_info(cls, home: dict) -> dict:
        """Summarise the unit prices of a building entry.

        Returns:
            Keyword arguments for ``Building``: the listing type, the number
            of units and the minimum / maximum / average unit price.  The
            three price fields are ``None`` when the entry lists no units.
        """
        # Parse every unit price exactly once.
        prices = [
            cls._parse_unit_price(unit["price"]) for unit in home.get("units") or []
        ]

        return {
            "listing_type": cls.listing_type,
            "num_units": len(prices),
            "min_unit_price": min(prices, default=None),
            "max_unit_price": max(prices, default=None),
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _parse_unit_price(price: str) -> int:
        """Turn a price label such as ``"$1,250+"`` into the integer 1250."""
        return int(price.replace("$", "").replace(",", "").split("+")[0])

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Collect the price-related ``Property`` fields of a search result.

        Returns:
            A dict with the keys ``price``, ``square_feet``, ``lot_size`` and
            ``price_per_square_foot``; values that cannot be derived are
            ``None``.
        """
        home_info = home["hdpData"]["homeInfo"]

        raw_price = home_info.get("priceForHDP")
        price = int(raw_price) if raw_price is not None else None
        square_feet = home_info.get("livingArea")
        lot_size = home_info.get("lotAreaValue")
        # int(): floor division by a float area would otherwise yield a float.
        price_per_square_foot = (
            int(price // square_feet) if square_feet and price else None
        )

        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": lot_size,
            "price_per_square_foot": price_per_square_foot,
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Return the text after ``"Listing by: "`` in ``brokerName``, if any."""
        # ``or ""`` also covers a brokerName that is present but None.
        broker_str = home.get("brokerName") or ""
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str):
        """Split a trailing unit designator off a street address.

        Returns:
            ``(street, unit)``, where ``unit`` is the trailing ``APT ...`` or
            ``#...`` part, or ``None`` when the address has none.
        """
        # ``\b`` and the negative lookahead restrict "APT" to a standalone
        # token, so street names such as "Aptos Way" or "Captain Rd" are no
        # longer mistaken for a unit designator.
        apt_match = re.search(
            r"(\bAPT(?![a-z])\s*.+|#[\s\S]+)$", address_one, re.I
        )
        if not apt_match:
            return address_one, None

        address_two = apt_match.group().strip()
        # Cut at the match position instead of str.replace(), which would
        # also delete earlier occurrences of the same text.
        return address_one[: apt_match.start()].strip(), address_two

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an ``Address`` from the ``hdpData.homeInfo`` of a result."""
        home_info = home["hdpData"]["homeInfo"]
        keys = ("streetAddress", "city", "state", "zipcode")
        address_one, city, state, zip_code = (home_info[key] for key in keys)
        address_one, address_two = ZillowScraper._parse_address_two(address_one)
        return Address(address_one, city, state, zip_code, address_two=address_two)

    @staticmethod
    def _absolute_url(path_or_url: str) -> str:
        """Prefix the Zillow origin unless the value already names zillow.com."""
        if "zillow.com" in path_or_url:
            return path_or_url
        return f"https://www.zillow.com{path_or_url}"

    @staticmethod
    def _get_headers():
        """Return the browser-like request headers sent with the page request."""
        return {
            # Was "parser-external.geo.moveaws.com", the host used by the
            # realtor.com scraper; this request goes to Zillow.
            "authority": "www.zillow.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }

View File

@ -1,8 +1,14 @@
class InvalidSite(Exception): class InvalidSite(Exception):
"""Raised when a provided site is does not exist.""" """Raised when a provided site is does not exist."""
pass
class InvalidListingType(Exception): class InvalidListingType(Exception):
"""Raised when a provided listing type is does not exist.""" """Raised when a provided listing type is does not exist."""
pass
class NoResultsFound(Exception):
    """Signals that a search for the given location produced no results."""
class PropertyNotFound(Exception):
    """Signals that no property could be found for the given address."""

View File

@ -3,10 +3,7 @@ from homeharvest import scrape_property
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property( scrape_property(location="85281", site_name="realtor.com"),
location="85281",
site_name="realtor.com"
),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])

View File

@ -3,22 +3,10 @@ from homeharvest import scrape_property
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property( scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
location="2530 Al Lipscomb Way", scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
site_name="redfin" scrape_property(location="Dallas, TX, USA", site_name="redfin"),
), scrape_property(location="85281", site_name="redfin"),
scrape_property(
location="Phoenix, AZ, USA",
site_name="redfin"
),
scrape_property(
location="Dallas, TX, USA",
site_name="redfin"
),
scrape_property(
location="85281",
site_name="redfin"
),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])

12
tests/test_zillow.py Normal file
View File

@ -0,0 +1,12 @@
from homeharvest import scrape_property
def test_zillow():
    """Scrape Zillow for a street address, two cities and a ZIP code.

    Every call must return something other than ``None``.
    """
    locations = (
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    )

    results = []
    for place in locations:
        results.append(scrape_property(location=place, site_name="zillow"))

    assert all(outcome is not None for outcome in results)