feat(scrapers): add zillow

pull/1/head
Cullen Watson 2023-09-17 15:06:31 -05:00
parent 2f3b012747
commit 2f5ea1ca88
11 changed files with 349 additions and 97 deletions

5
.gitignore vendored
View File

@ -1,2 +1,5 @@
/.idea
dist
**/dist/
**/__pycache__/
**/.pytest_cache/
*.pyc

View File

@ -1,13 +1,16 @@
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.types import ListingType, Property
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union
_scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper
"realtor.com": RealtorScraper,
"zillow": ZillowScraper,
}
@ -15,12 +18,14 @@ def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> list[Property]: #: eventually, return pandas dataframe
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)
scraper_input = ScraperInput(
location=location,

View File

@ -1,6 +1,6 @@
from dataclasses import dataclass
import requests
from .types import Property, ListingType
from .models import Property, ListingType
@dataclass
@ -11,9 +11,12 @@ class ScraperInput:
class Scraper:
listing_type = ListingType.FOR_SALE
def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.session = requests.Session()
Scraper.listing_type = scraper_input.listing_type
if scraper_input.proxy_url:
self.session.proxies = {
@ -21,9 +24,12 @@ class Scraper:
"https": scraper_input.proxy_url,
}
def search(self) -> list[Property]: ...
def search(self) -> list[Property]:
...
@staticmethod
def parse_home(home) -> Property: ...
def _parse_home(home) -> Property:
...
def handle_location(self): ...
def handle_location(self):
...

View File

@ -24,14 +24,29 @@ class Property:
url: str
beds: int | None = None
baths: int | None = None
baths: float | None = None
stories: int | None = None
agent_name: str | None = None
description: str | None = None
year_built: int | None = None
square_feet: int | None = None
price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None
mls_id: str | None = None
property_type: str | None = None
listing_type: ListingType | None = None
lot_size: int | None = None
description: str | None = None
@dataclass
class Building:
    # Aggregate result for a multi-unit listing (Zillow returns these for
    # apartment buildings) as opposed to a single Property.
    address: Address
    url: str
    # Unit statistics; all optional because not every result carries them.
    num_units: int | None = None
    min_unit_price: int | None = None
    max_unit_price: int | None = None
    avg_unit_price: int | None = None
    listing_type: str | None = None

View File

@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any
@ -10,39 +10,42 @@ class RealtorScraper(Scraper):
def handle_location(self):
    """Resolve ``self.location`` via realtor.com's geo-suggest service.

    Returns:
        The first autocomplete match — a dict that includes keys such as
        ``area_type`` (consumed by ``search``).

    Raises:
        requests.JSONDecodeError / KeyError / IndexError if the service
        returns an unexpected payload.
    """
    # Browser-like headers; the suggest endpoint rejects bare clients.
    headers = {
        "authority": "parser-external.geo.moveaws.com",
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9",
        "origin": "https://www.realtor.com",
        "referer": "https://www.realtor.com/",
        "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    params = {
        "input": self.location,
        "client_id": "for-sale",
        "limit": "1",  # only the best match is used
        "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
    }

    response = self.session.get(
        "https://parser-external.geo.moveaws.com/suggest",
        params=params,
        headers=headers,
    )
    response_json = response.json()

    return response_json["autocomplete"][0]
def search(self):
location_info = self.handle_location()
location_type = location_info['area_type']
location_type = location_info["area_type"]
"""
property types:
apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes
"""
print('a')
print("a")

View File

@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any
@ -8,11 +8,13 @@ class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)
def handle_location(self):
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location)
def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
self.location
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
def get_region_type(match_type: str):
if match_type == "4":
@ -22,51 +24,53 @@ class RedfinScraper(Scraper):
elif match_type == "1":
return "address" #: address, needs to be handled differently
if response_json['payload']['exactMatch'] is not None:
target = response_json['payload']['exactMatch']
if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"]
else:
target = response_json['payload']['sections'][0]['rows'][0]
target = response_json["payload"]["sections"][0]["rows"][0]
return target['id'].split('_')[1], get_region_type(target['type'])
return target["id"].split("_")[1], get_region_type(target["type"])
@staticmethod
def _parse_home(home: dict, single_search: bool = False) -> Property:
    """Convert one Redfin home payload into a Property.

    Args:
        home: Redfin JSON for a single home. Shape differs between the
            GIS search endpoint and the aboveTheFold detail endpoint.
        single_search: True when ``home`` came from the single-address
            detail endpoint (``addressSectionInfo``), which nests the
            street address under ``streetAddress`` and stores
            ``yearBuilt`` as a plain value.
    """

    def get_value(key: str) -> Any | None:
        # Many Redfin fields are wrapped as {"value": ...}; returns None
        # (implicitly) when the key or wrapper is absent.
        if key in home and "value" in home[key]:
            return home[key]["value"]

    if not single_search:
        address = Address(
            address_one=get_value("streetLine"),
            city=home["city"],
            state=home["state"],
            zip_code=home["zip"],
        )
    else:
        address_info = home["streetAddress"]
        address = Address(
            address_one=address_info["assembledAddress"],
            city=home["city"],
            state=home["state"],
            zip_code=home["zip"],
        )

    url = "https://www.redfin.com{}".format(home["url"])

    return Property(
        address=address,
        url=url,
        # dict.get replaces the `x if k in home else None` pattern
        beds=home.get("beds"),
        baths=home.get("baths"),
        stories=home.get("stories"),
        agent_name=get_value("listingAgent"),
        description=home.get("listingRemarks"),
        year_built=get_value("yearBuilt") if not single_search else home["yearBuilt"],
        square_feet=get_value("sqFt"),
        price_per_square_foot=get_value("pricePerSqFt"),
        price=get_value("price"),
        mls_id=get_value("mlsId"),
    )
def handle_address(self, home_id: str):
@ -78,25 +82,33 @@ class RedfinScraper(Scraper):
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
"""
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(home_id)
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
parsed_home = self.parse_home(response_json['payload']['addressSectionInfo'], single_search=True)
parsed_home = self._parse_home(
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home]
def search(self):
    """Search Redfin for ``self.location`` and return parsed Properties.

    An exact street address resolves to region_type "address", which is
    handled via the single-home detail endpoint; anything broader goes
    through the GIS search endpoint.
    """
    region_id, region_type = self._handle_location()

    if region_type == "address":
        home_id = region_id
        return self.handle_address(home_id)

    url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
        region_id, region_type
    )
    response = self.session.get(url)
    # Redfin prefixes its JSON with "{}&&" as an anti-hijacking guard.
    response_json = json.loads(response.text.replace("{}&&", ""))

    homes = [
        self._parse_home(home) for home in response_json["payload"]["homes"]
    ]  #: support buildings
    return homes

View File

@ -0,0 +1,205 @@
import re
import json
from ..models import Property, Address, Building, ListingType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper
class ZillowScraper(Scraper):
    """Scraper for zillow.com.

    A location query renders either a search-results page (many homes,
    parsed via ``_parse_home``) or a single-property page (exact address,
    parsed via ``_get_single_property_page``). Both embed their data in the
    page's ``__NEXT_DATA__`` JSON blob, which ``search`` extracts.
    """

    # Annotation only — deliberately *not* an assignment. Scraper.__init__
    # writes the requested type onto the base class (Scraper.listing_type),
    # and a class attribute here would shadow that. (The original
    # `listing_type: ListingType.FOR_SALE` used a value as the annotation,
    # which was a typo.)
    listing_type: ListingType

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        if self.listing_type == ListingType.FOR_SALE:
            self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
        elif self.listing_type == ListingType.FOR_RENT:
            self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
        # NOTE(review): any other listing type leaves self.url unset, so
        # search() fails with AttributeError — confirm the supported set.

    def search(self):
        """Fetch ``self.url`` and parse the embedded ``__NEXT_DATA__`` JSON.

        Returns:
            list of Property/Building for a results page, or a one-element
            list for an exact-address page.

        Raises:
            NoResultsFound: the page had no ``__NEXT_DATA__`` blob.
            PropertyNotFound: the blob had neither search results nor
                single-property data.
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()
        content = resp.text

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            content,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        data = json.loads(match.group(1))

        if "searchPageState" in data["props"]["pageProps"]:
            houses = data["props"]["pageProps"]["searchPageState"]["cat1"][
                "searchResults"
            ]["listResults"]
            return [self._parse_home(house) for house in houses]
        elif "gdpClientCache" in data["props"]["pageProps"]:
            # gdpClientCache is a JSON string nested inside the outer JSON.
            gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
            main_key = list(gdp_client_cache.keys())[0]

            property_data = gdp_client_cache[main_key]["property"]
            # renamed from `property` — don't shadow the builtin
            parsed_property = self._get_single_property_page(property_data)
            return [parsed_property]
        raise PropertyNotFound("Specific property data not found in the response.")

    @classmethod
    def _parse_home(cls, home: dict):
        """Parse one entry of a search-results page.

        Entries carrying ``hdpData``/``homeInfo`` are individual homes and
        become a Property; otherwise the entry is a multi-unit building and
        becomes a Building.
        """
        url = (
            f"https://www.zillow.com{home['detailUrl']}"
            if "zillow.com" not in home["detailUrl"]
            else home["detailUrl"]
        )

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            price_data = cls._extract_price(home)
            address = cls._extract_address(home)
            agent_name = cls._extract_agent_name(home)
            beds = home["hdpData"]["homeInfo"]["bedrooms"]
            baths = home["hdpData"]["homeInfo"]["bathrooms"]
            listing_type = home["hdpData"]["homeInfo"].get("homeType")

            return Property(
                address=address,
                agent_name=agent_name,
                url=url,
                beds=beds,
                baths=baths,
                listing_type=listing_type,
                **price_data,
            )
        else:
            keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
            address_one, city, state, zip_code = (home[key] for key in keys)
            address_one, address_two = cls._parse_address_two(address_one)
            address = Address(address_one, city, state, zip_code, address_two)

            building_info = cls._extract_building_info(home)
            return Building(address=address, url=url, **building_info)

    @classmethod
    def _get_single_property_page(cls, property_data: dict):
        """Build a Property from the gdpClientCache payload of an
        exact-address page."""
        url = (
            f"https://www.zillow.com{property_data['hdpUrl']}"
            if "zillow.com" not in property_data["hdpUrl"]
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )

        # `or {}` guards against the nested objects being present but null.
        attribution = property_data.get("attributionInfo") or {}
        reso_facts = property_data.get("resoFacts") or {}

        return Property(
            address=address,
            url=url,
            beds=property_data.get("bedrooms"),
            baths=property_data.get("bathrooms"),
            year_built=property_data.get("yearBuilt"),
            price=property_data.get("price"),
            lot_size=property_data.get("lotSize"),
            agent_name=attribution.get("agentName"),
            stories=reso_facts.get("stories"),
            description=property_data.get("description"),
            mls_id=attribution.get("mlsId"),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot"),
            square_feet=property_data.get("livingArea"),
            listing_type=property_data.get("homeType"),
        )

    @classmethod
    def _extract_building_info(cls, home: dict) -> dict:
        """Summarize unit pricing for a multi-unit building result.

        Prices like ``"$1,200+"`` are normalized to their integer lower
        bound. Prices are computed once (the original recomputed the same
        list three times) and min/max are guarded against an empty unit
        list, which would otherwise raise ValueError.
        """
        prices = [
            int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
            for unit in home["units"]
        ]
        return {
            "listing_type": cls.listing_type,
            "num_units": len(home["units"]),
            "min_unit_price": min(prices) if prices else None,
            "max_unit_price": max(prices) if prices else None,
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Extract price/size fields from a search-result home.

        Returned keys feed Property(**price_data) in _parse_home. The
        original filtered locals() to build this dict; an explicit literal
        is equivalent and far less fragile.
        """
        home_info = home["hdpData"]["homeInfo"]
        price = int(home_info["priceForHDP"])
        square_feet = home_info.get("livingArea")
        lot_size = home_info.get("lotAreaValue")
        price_per_square_foot = price // square_feet if square_feet and price else None
        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": lot_size,
            "price_per_square_foot": price_per_square_foot,
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Pull the agent name out of a "Listing by: <name>" broker string."""
        broker_str = home.get("brokerName", "")
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str):
        """Split an apartment/unit suffix (e.g. "APT 4" or "#12") off a
        street line; returns (address_one, address_two_or_None)."""
        apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
        address_two = apt_match.group().strip() if apt_match else None
        address_one = (
            address_one.replace(address_two, "").strip() if address_two else address_one
        )
        return address_one, address_two

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an Address from a search-result home's homeInfo fields."""
        keys = ("streetAddress", "city", "state", "zipcode")
        address_one, city, state, zip_code = (
            home["hdpData"]["homeInfo"][key] for key in keys
        )
        address_one, address_two = ZillowScraper._parse_address_two(address_one)
        return Address(address_one, city, state, zip_code, address_two=address_two)

    @staticmethod
    def _get_headers():
        """Browser-like headers for Zillow requests.

        NOTE(review): "authority" points at parser-external.geo.moveaws.com
        (the realtor.com geo service) while origin/referer are zillow.com —
        likely copy-pasted from RealtorScraper; confirm whether Zillow
        accepts/ignores it before changing.
        """
        return {
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }

View File

@ -1,8 +1,14 @@
class InvalidSite(Exception):
    """Raised when the requested site name is not a supported scraper."""
class InvalidListingType(Exception):
    """Raised when the requested listing type is not a known ListingType."""
class NoResultsFound(Exception):
    """Raised when a location search returns no results."""
class PropertyNotFound(Exception):
    """Raised when no property can be resolved for the given address."""

View File

@ -3,10 +3,7 @@ from homeharvest import scrape_property
def test_realtor():
    """Smoke test: realtor.com scrape of a zip code returns a result.

    (The diff-merged source showed the same call twice — old and new diff
    lines interleaved; this is the post-commit form.)
    """
    results = [
        scrape_property(location="85281", site_name="realtor.com"),
    ]
    assert all([result is not None for result in results])

View File

@ -3,22 +3,10 @@ from homeharvest import scrape_property
def test_redfin():
    """Smoke test: redfin scrapes succeed for address, city, and zip inputs.

    (The diff-merged source duplicated each call in old and new style; this
    is the post-commit form.)
    """
    results = [
        scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
        scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
        scrape_property(location="Dallas, TX, USA", site_name="redfin"),
        scrape_property(location="85281", site_name="redfin"),
    ]
    assert all([result is not None for result in results])

12
tests/test_zillow.py Normal file
View File

@ -0,0 +1,12 @@
from homeharvest import scrape_property
def test_zillow():
    """Smoke test: zillow scrapes succeed for address, city, and zip inputs."""
    locations = [
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    ]
    results = [
        scrape_property(location=location, site_name="zillow")
        for location in locations
    ]
    assert all([result is not None for result in results])