fix: use zillow backend endpoint
parent
905cfcae2c
commit
dc8c15959f
|
@ -1,7 +1,7 @@
|
||||||
from .core.scrapers.redfin import RedfinScraper
|
from .core.scrapers.redfin import RedfinScraper
|
||||||
from .core.scrapers.realtor import RealtorScraper
|
from .core.scrapers.realtor import RealtorScraper
|
||||||
from .core.scrapers.zillow import ZillowScraper
|
from .core.scrapers.zillow import ZillowScraper
|
||||||
from .core.scrapers.models import ListingType, Property, Building, SiteName
|
from .core.scrapers.models import ListingType, Property, SiteName
|
||||||
from .core.scrapers import ScraperInput
|
from .core.scrapers import ScraperInput
|
||||||
from .exceptions import InvalidSite, InvalidListingType
|
from .exceptions import InvalidSite, InvalidListingType
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
@ -25,60 +25,62 @@ def validate_input(site_name: str, listing_type: str) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_ordered_properties(result: Union[Building, Property]) -> list[str]:
|
def get_ordered_properties(result: Property) -> list[str]:
|
||||||
if isinstance(result, Property):
|
return [
|
||||||
return [
|
"property_url",
|
||||||
"listing_type",
|
"site_name",
|
||||||
"address_one",
|
"listing_type",
|
||||||
"city",
|
"property_type",
|
||||||
"state",
|
"status_text",
|
||||||
"zip_code",
|
"currency",
|
||||||
"address_two",
|
"price",
|
||||||
"url",
|
"apt_min_price",
|
||||||
"property_type",
|
"tax_assessed_value",
|
||||||
"price",
|
"square_feet",
|
||||||
"beds",
|
"price_per_sqft",
|
||||||
"baths",
|
"beds",
|
||||||
"square_feet",
|
"baths",
|
||||||
"price_per_square_foot",
|
"lot_area_value",
|
||||||
"lot_size",
|
"lot_area_unit",
|
||||||
"stories",
|
"street_address",
|
||||||
"year_built",
|
"unit",
|
||||||
"agent_name",
|
"city",
|
||||||
"mls_id",
|
"state",
|
||||||
"description",
|
"zip_code",
|
||||||
]
|
"country",
|
||||||
elif isinstance(result, Building):
|
"posted_time",
|
||||||
return [
|
"bldg_min_beds",
|
||||||
"address_one",
|
"bldg_min_baths",
|
||||||
"city",
|
"bldg_min_area",
|
||||||
"state",
|
"bldg_unit_count",
|
||||||
"zip_code",
|
"bldg_name",
|
||||||
"address_two",
|
"stories",
|
||||||
"url",
|
"year_built",
|
||||||
"num_units",
|
"agent_name",
|
||||||
"min_unit_price",
|
"mls_id",
|
||||||
"max_unit_price",
|
"description",
|
||||||
"avg_unit_price",
|
"img_src",
|
||||||
"listing_type",
|
"latitude",
|
||||||
]
|
"longitude",
|
||||||
return []
|
]
|
||||||
|
|
||||||
|
|
||||||
def process_result(result: Union[Building, Property]) -> pd.DataFrame:
|
def process_result(result: Property) -> pd.DataFrame:
|
||||||
prop_data = result.__dict__
|
prop_data = result.__dict__
|
||||||
|
|
||||||
address_data = prop_data["address"]
|
|
||||||
prop_data["site_name"] = prop_data["site_name"].value
|
prop_data["site_name"] = prop_data["site_name"].value
|
||||||
prop_data["listing_type"] = prop_data["listing_type"].value
|
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
|
||||||
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
||||||
prop_data["address_one"] = address_data.address_one
|
if "address" in prop_data:
|
||||||
prop_data["city"] = address_data.city
|
address_data = prop_data["address"]
|
||||||
prop_data["state"] = address_data.state
|
prop_data["street_address"] = address_data.street_address
|
||||||
prop_data["zip_code"] = address_data.zip_code
|
prop_data["unit"] = address_data.unit
|
||||||
prop_data["address_two"] = address_data.address_two
|
prop_data["city"] = address_data.city
|
||||||
|
prop_data["state"] = address_data.state
|
||||||
|
prop_data["zip_code"] = address_data.zip_code
|
||||||
|
prop_data["country"] = address_data.country
|
||||||
|
|
||||||
del prop_data["address"]
|
del prop_data["address"]
|
||||||
|
|
||||||
properties_df = pd.DataFrame([prop_data])
|
properties_df = pd.DataFrame([prop_data])
|
||||||
properties_df = properties_df[get_ordered_properties(result)]
|
properties_df = properties_df[get_ordered_properties(result)]
|
||||||
|
@ -90,7 +92,7 @@ def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
site_name: str,
|
site_name: str,
|
||||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||||
) -> Union[list[Building], list[Property]]:
|
) -> list[Property]:
|
||||||
validate_input(site_name, listing_type)
|
validate_input(site_name, listing_type)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
|
@ -103,5 +105,7 @@ def scrape_property(
|
||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
properties_dfs = [process_result(result) for result in results]
|
||||||
|
if not properties_dfs:
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
return pd.concat(properties_dfs, ignore_index=True)
|
return pd.concat(properties_dfs, ignore_index=True)
|
||||||
|
|
|
@ -9,22 +9,28 @@ class SiteName(Enum):
|
||||||
|
|
||||||
|
|
||||||
class ListingType(Enum):
|
class ListingType(Enum):
|
||||||
FOR_SALE = "for_sale"
|
FOR_SALE = "FOR_SALE"
|
||||||
FOR_RENT = "for_rent"
|
FOR_RENT = "FOR_RENT"
|
||||||
SOLD = "sold"
|
SOLD = "SOLD"
|
||||||
|
|
||||||
|
|
||||||
class PropertyType(Enum):
|
class PropertyType(Enum):
|
||||||
HOUSE = "HOUSE"
|
HOUSE = "HOUSE"
|
||||||
|
BUILDING = "BUILDING"
|
||||||
CONDO = "CONDO"
|
CONDO = "CONDO"
|
||||||
TOWNHOUSE = "TOWNHOUSE"
|
TOWNHOUSE = "TOWNHOUSE"
|
||||||
SINGLE_FAMILY = "SINGLE_FAMILY"
|
SINGLE_FAMILY = "SINGLE_FAMILY"
|
||||||
MULTI_FAMILY = "MULTI_FAMILY"
|
MULTI_FAMILY = "MULTI_FAMILY"
|
||||||
MANUFACTURED = "MANUFACTURED"
|
MANUFACTURED = "MANUFACTURED"
|
||||||
|
NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
|
||||||
APARTMENT = "APARTMENT"
|
APARTMENT = "APARTMENT"
|
||||||
|
APARTMENTS = "APARTMENTS"
|
||||||
LAND = "LAND"
|
LAND = "LAND"
|
||||||
|
LOT = "LOT"
|
||||||
OTHER = "OTHER"
|
OTHER = "OTHER"
|
||||||
|
|
||||||
|
BLANK = "BLANK"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_int_code(cls, code):
|
def from_int_code(cls, code):
|
||||||
mapping = {
|
mapping = {
|
||||||
|
@ -38,48 +44,56 @@ class PropertyType(Enum):
|
||||||
13: cls.SINGLE_FAMILY,
|
13: cls.SINGLE_FAMILY,
|
||||||
}
|
}
|
||||||
|
|
||||||
return mapping.get(code, cls.OTHER)
|
return mapping.get(code, cls.BLANK)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Address:
|
class Address:
|
||||||
address_one: str
|
street_address: str
|
||||||
city: str
|
city: str
|
||||||
state: str
|
state: str
|
||||||
zip_code: str
|
zip_code: str
|
||||||
|
unit: str
|
||||||
address_two: str | None = None
|
country: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass()
|
|
||||||
class Realty:
|
|
||||||
site_name: SiteName
|
|
||||||
address: Address
|
|
||||||
url: str
|
|
||||||
listing_type: ListingType | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property(Realty):
|
class Property:
|
||||||
|
property_url: str
|
||||||
|
site_name: SiteName
|
||||||
|
listing_type: ListingType
|
||||||
|
property_type: PropertyType
|
||||||
|
address: Address
|
||||||
|
|
||||||
|
# house for sale
|
||||||
price: int | None = None
|
price: int | None = None
|
||||||
|
tax_assessed_value: int | None = None
|
||||||
|
currency: str | None = None
|
||||||
|
square_feet: int | None = None
|
||||||
beds: int | None = None
|
beds: int | None = None
|
||||||
baths: float | None = None
|
baths: float | None = None
|
||||||
|
lot_area_value: float | None = None
|
||||||
|
lot_area_unit: str | None = None
|
||||||
stories: int | None = None
|
stories: int | None = None
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
square_feet: int | None = None
|
price_per_sqft: int | None = None
|
||||||
price_per_square_foot: int | None = None
|
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent_name: str | None = None
|
||||||
property_type: PropertyType | None = None
|
img_src: str | None = None
|
||||||
lot_size: int | None = None
|
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
|
status_text: str | None = None
|
||||||
|
latitude: float | None = None
|
||||||
|
longitude: float | None = None
|
||||||
|
posted_time: str | None = None
|
||||||
|
|
||||||
|
# building for sale
|
||||||
|
bldg_name: str | None = None
|
||||||
|
bldg_unit_count: int | None = None
|
||||||
|
bldg_min_beds: int | None = None
|
||||||
|
bldg_min_baths: float | None = None
|
||||||
|
bldg_min_area: int | None = None
|
||||||
|
|
||||||
@dataclass
|
# apt
|
||||||
class Building(Realty):
|
apt_min_price: int | None = None
|
||||||
num_units: int | None = None
|
|
||||||
min_unit_price: int | None = None
|
|
||||||
max_unit_price: int | None = None
|
|
||||||
avg_unit_price: int | None = None
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from ..models import Property, Address, Building, ListingType, PropertyType
|
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
||||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
|
|
||||||
|
@ -13,6 +13,8 @@ class ZillowScraper(Scraper):
|
||||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
||||||
elif self.listing_type == ListingType.FOR_RENT:
|
elif self.listing_type == ListingType.FOR_RENT:
|
||||||
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
|
||||||
|
else:
|
||||||
|
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(self.url, headers=self._get_headers())
|
resp = self.session.get(self.url, headers=self._get_headers())
|
||||||
|
@ -33,10 +35,17 @@ class ZillowScraper(Scraper):
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
|
|
||||||
if "searchPageState" in data["props"]["pageProps"]:
|
if "searchPageState" in data["props"]["pageProps"]:
|
||||||
houses = data["props"]["pageProps"]["searchPageState"]["cat1"][
|
pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
|
||||||
"searchResults"
|
|
||||||
]["listResults"]
|
match = re.search(pattern, content)
|
||||||
return [self._parse_home(house) for house in houses]
|
|
||||||
|
if match:
|
||||||
|
coords = [float(coord) for coord in match.groups()]
|
||||||
|
return self._fetch_properties_backend(coords)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise BoxBoundsNotFound("Box bounds could not be located.")
|
||||||
|
|
||||||
elif "gdpClientCache" in data["props"]["pageProps"]:
|
elif "gdpClientCache" in data["props"]["pageProps"]:
|
||||||
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
|
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
|
||||||
main_key = list(gdp_client_cache.keys())[0]
|
main_key = list(gdp_client_cache.keys())[0]
|
||||||
|
@ -47,45 +56,188 @@ class ZillowScraper(Scraper):
|
||||||
return [property]
|
return [property]
|
||||||
raise PropertyNotFound("Specific property data not found in the response.")
|
raise PropertyNotFound("Specific property data not found in the response.")
|
||||||
|
|
||||||
def _parse_home(self, home: dict):
|
def _fetch_properties_backend(self, coords):
|
||||||
"""
|
url = "https://www.zillow.com/async-create-search-page-state"
|
||||||
This method is used when a user enters a generic location & zillow returns more than one property
|
|
||||||
"""
|
filter_state_for_sale = {
|
||||||
url = (
|
"sortSelection": {
|
||||||
f"https://www.zillow.com{home['detailUrl']}"
|
# "value": "globalrelevanceex"
|
||||||
if "zillow.com" not in home["detailUrl"]
|
"value": "days"
|
||||||
else home["detailUrl"]
|
},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter_state_for_rent = {
|
||||||
|
"isForRent": {"value": True},
|
||||||
|
"isForSaleByAgent": {"value": False},
|
||||||
|
"isForSaleByOwner": {"value": False},
|
||||||
|
"isNewConstruction": {"value": False},
|
||||||
|
"isComingSoon": {"value": False},
|
||||||
|
"isAuction": {"value": False},
|
||||||
|
"isForSaleForeclosure": {"value": False},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter_state_sold = {
|
||||||
|
"isRecentlySold": {"value": True},
|
||||||
|
"isForSaleByAgent": {"value": False},
|
||||||
|
"isForSaleByOwner": {"value": False},
|
||||||
|
"isNewConstruction": {"value": False},
|
||||||
|
"isComingSoon": {"value": False},
|
||||||
|
"isAuction": {"value": False},
|
||||||
|
"isForSaleForeclosure": {"value": False},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
selected_filter = (
|
||||||
|
filter_state_for_rent
|
||||||
|
if self.listing_type == ListingType.FOR_RENT
|
||||||
|
else filter_state_for_sale
|
||||||
|
if self.listing_type == ListingType.FOR_SALE
|
||||||
|
else filter_state_sold
|
||||||
)
|
)
|
||||||
|
|
||||||
if "hdpData" in home and "homeInfo" in home["hdpData"]:
|
payload = json.dumps(
|
||||||
price_data = self._extract_price(home)
|
{
|
||||||
address = self._extract_address(home)
|
"searchQueryState": {
|
||||||
agent_name = self._extract_agent_name(home)
|
"pagination": {},
|
||||||
beds = home["hdpData"]["homeInfo"]["bedrooms"]
|
"isMapVisible": True,
|
||||||
baths = home["hdpData"]["homeInfo"]["bathrooms"]
|
"mapBounds": {
|
||||||
property_type = home["hdpData"]["homeInfo"].get("homeType")
|
"west": coords[0],
|
||||||
|
"east": coords[1],
|
||||||
|
"south": coords[2],
|
||||||
|
"north": coords[3],
|
||||||
|
},
|
||||||
|
"filterState": selected_filter,
|
||||||
|
"isListVisible": True,
|
||||||
|
"mapZoom": 11,
|
||||||
|
},
|
||||||
|
"wants": {"cat1": ["mapResults"]},
|
||||||
|
"isDebugRequest": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
print(payload)
|
||||||
|
resp = self.session.put(url, headers=self._get_headers(), data=payload)
|
||||||
|
resp.raise_for_status()
|
||||||
|
a = resp.json()
|
||||||
|
return self._parse_properties(resp.json())
|
||||||
|
|
||||||
return Property(
|
def _parse_properties(self, property_data: dict):
|
||||||
site_name=self.site_name,
|
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
||||||
address=address,
|
|
||||||
agent_name=agent_name,
|
|
||||||
url=url,
|
|
||||||
beds=beds,
|
|
||||||
baths=baths,
|
|
||||||
listing_type=self.listing_type,
|
|
||||||
property_type=PropertyType(property_type),
|
|
||||||
**price_data,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
|
|
||||||
address_one, city, state, zip_code = (home[key] for key in keys)
|
|
||||||
address_one, address_two = self._parse_address_two(address_one)
|
|
||||||
address = Address(address_one, city, state, zip_code, address_two)
|
|
||||||
|
|
||||||
building_info = self._extract_building_info(home)
|
properties_list = []
|
||||||
return Building(
|
|
||||||
site_name=self.site_name, address=address, url=url, **building_info
|
for result in mapresults:
|
||||||
|
try:
|
||||||
|
if "hdpData" in result:
|
||||||
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
|
address_data = {
|
||||||
|
"street_address": home_info["streetAddress"],
|
||||||
|
"unit": home_info.get("unit"),
|
||||||
|
"city": home_info["city"],
|
||||||
|
"state": home_info["state"],
|
||||||
|
"zip_code": home_info["zipcode"],
|
||||||
|
"country": home_info["country"],
|
||||||
|
}
|
||||||
|
property_data = {
|
||||||
|
"site_name": self.site_name,
|
||||||
|
"address": Address(**address_data),
|
||||||
|
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
||||||
|
"beds": int(home_info["bedrooms"])
|
||||||
|
if "bedrooms" in home_info
|
||||||
|
else None,
|
||||||
|
"baths": home_info.get("bathrooms"),
|
||||||
|
"square_feet": int(home_info["livingArea"])
|
||||||
|
if "livingArea" in home_info
|
||||||
|
else None,
|
||||||
|
"currency": home_info["currency"],
|
||||||
|
"price": home_info.get("price"),
|
||||||
|
"square_feet": int(home_info["livingArea"])
|
||||||
|
if "livingArea" in home_info
|
||||||
|
else None,
|
||||||
|
"tax_assessed_value": int(home_info["taxAssessedValue"])
|
||||||
|
if "taxAssessedValue" in home_info
|
||||||
|
else None,
|
||||||
|
"property_type": PropertyType(home_info["homeType"]),
|
||||||
|
"listing_type": ListingType(
|
||||||
|
home_info["statusType"]
|
||||||
|
if "statusType" in home_info
|
||||||
|
else self.listing_type
|
||||||
|
),
|
||||||
|
"lot_area_value": round(home_info["lotAreaValue"], 2)
|
||||||
|
if "lotAreaValue" in home_info
|
||||||
|
else None,
|
||||||
|
"lot_area_unit": home_info.get("lotAreaUnit"),
|
||||||
|
"latitude": result["latLong"]["latitude"],
|
||||||
|
"longitude": result["latLong"]["longitude"],
|
||||||
|
"status_text": result.get("statusText"),
|
||||||
|
"posted_time": result["variableData"]["text"]
|
||||||
|
if "variableData" in result
|
||||||
|
and "text" in result["variableData"]
|
||||||
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
|
else None,
|
||||||
|
"img_src": result.get("imgSrc"),
|
||||||
|
"price_per_sqft": int(
|
||||||
|
home_info["price"] // home_info["livingArea"]
|
||||||
|
)
|
||||||
|
if "livingArea" in home_info and "price" in home_info
|
||||||
|
else None,
|
||||||
|
}
|
||||||
|
property_obj = Property(**property_data)
|
||||||
|
properties_list.append(property_obj)
|
||||||
|
|
||||||
|
elif "isBuilding" in result:
|
||||||
|
price = result["price"]
|
||||||
|
building_data = {
|
||||||
|
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
||||||
|
"site_name": self.site_name,
|
||||||
|
"property_type": PropertyType("BUILDING"),
|
||||||
|
"listing_type": ListingType(result["statusType"]),
|
||||||
|
"img_src": result["imgSrc"],
|
||||||
|
"price": int(price.replace("From $", "").replace(",", ""))
|
||||||
|
if "From $" in price
|
||||||
|
else None,
|
||||||
|
"apt_min_price": int(
|
||||||
|
price.replace("$", "").replace(",", "").replace("+/mo", "")
|
||||||
|
)
|
||||||
|
if "+/mo" in price
|
||||||
|
else None,
|
||||||
|
"address": self._extract_address(result["address"]),
|
||||||
|
"bldg_min_beds": result["minBeds"],
|
||||||
|
"currency": "USD",
|
||||||
|
"bldg_min_baths": result["minBaths"],
|
||||||
|
"bldg_min_area": result.get("minArea"),
|
||||||
|
"bldg_unit_count": result["unitCount"],
|
||||||
|
"bldg_name": result.get("communityName"),
|
||||||
|
"status_text": result["statusText"],
|
||||||
|
"latitude": result["latLong"]["latitude"],
|
||||||
|
"longitude": result["latLong"]["longitude"],
|
||||||
|
}
|
||||||
|
building_obj = Property(**building_data)
|
||||||
|
properties_list.append(building_obj)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(home_info)
|
||||||
|
traceback.print_exc()
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
return properties_list
|
||||||
|
|
||||||
|
def _extract_units(self, result: dict):
|
||||||
|
units = {}
|
||||||
|
if "units" in result:
|
||||||
|
num_units = result.get("availabilityCount", len(result["units"]))
|
||||||
|
prices = [
|
||||||
|
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
||||||
|
for unit in result["units"]
|
||||||
|
]
|
||||||
|
units["apt_availability_count"] = num_units
|
||||||
|
units["apt_min_unit_price"] = min(prices)
|
||||||
|
units["apt_max_unit_price"] = max(prices)
|
||||||
|
units["apt_avg_unit_price"] = (
|
||||||
|
sum(prices) // num_units if num_units else None
|
||||||
)
|
)
|
||||||
|
return units
|
||||||
|
|
||||||
def _get_single_property_page(self, property_data: dict):
|
def _get_single_property_page(self, property_data: dict):
|
||||||
"""
|
"""
|
||||||
|
@ -97,32 +249,38 @@ class ZillowScraper(Scraper):
|
||||||
else property_data["hdpUrl"]
|
else property_data["hdpUrl"]
|
||||||
)
|
)
|
||||||
address_data = property_data["address"]
|
address_data = property_data["address"]
|
||||||
address_one, address_two = self._parse_address_two(
|
unit = self._parse_address_two(address_data["streetAddress"])
|
||||||
address_data["streetAddress"]
|
|
||||||
)
|
|
||||||
address = Address(
|
address = Address(
|
||||||
address_one=address_one,
|
street_address=address_data["streetAddress"],
|
||||||
address_two=address_two,
|
unit=unit,
|
||||||
city=address_data["city"],
|
city=address_data["city"],
|
||||||
state=address_data["state"],
|
state=address_data["state"],
|
||||||
zip_code=address_data["zipcode"],
|
zip_code=address_data["zipcode"],
|
||||||
|
country=property_data.get("country"),
|
||||||
)
|
)
|
||||||
property_type = property_data.get("homeType", None)
|
property_type = property_data.get("homeType", None)
|
||||||
|
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=address,
|
address=address,
|
||||||
url=url,
|
property_url=url,
|
||||||
beds=property_data.get("bedrooms", None),
|
beds=property_data.get("bedrooms", None),
|
||||||
baths=property_data.get("bathrooms", None),
|
baths=property_data.get("bathrooms", None),
|
||||||
year_built=property_data.get("yearBuilt", None),
|
year_built=property_data.get("yearBuilt", None),
|
||||||
price=property_data.get("price", None),
|
price=property_data.get("price", None),
|
||||||
lot_size=property_data.get("lotSize", None),
|
tax_assessed_value=property_data.get("taxAssessedValue", None),
|
||||||
|
latitude=property_data.get("latitude"),
|
||||||
|
longitude=property_data.get("longitude"),
|
||||||
|
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
|
||||||
|
currency=property_data.get("currency", None),
|
||||||
|
lot_area_value=property_data.get("lotAreaValue"),
|
||||||
|
lot_area_unit=property_data["lotAreaUnits"].lower()
|
||||||
|
if "lotAreaUnits" in property_data
|
||||||
|
else None,
|
||||||
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
|
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
|
||||||
stories=property_data.get("resoFacts", {}).get("stories", None),
|
stories=property_data.get("resoFacts", {}).get("stories", None),
|
||||||
description=property_data.get("description", None),
|
description=property_data.get("description", None),
|
||||||
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
|
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
|
||||||
price_per_square_foot=property_data.get("resoFacts", {}).get(
|
price_per_sqft=property_data.get("resoFacts", {}).get(
|
||||||
"pricePerSquareFoot", None
|
"pricePerSquareFoot", None
|
||||||
),
|
),
|
||||||
square_feet=property_data.get("livingArea", None),
|
square_feet=property_data.get("livingArea", None),
|
||||||
|
@ -130,81 +288,59 @@ class ZillowScraper(Scraper):
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_building_info(self, home: dict) -> dict:
|
|
||||||
num_units = len(home["units"])
|
|
||||||
prices = [
|
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
|
||||||
for unit in home["units"]
|
|
||||||
]
|
|
||||||
return {
|
|
||||||
"listing_type": self.listing_type,
|
|
||||||
"num_units": len(home["units"]),
|
|
||||||
"min_unit_price": min(
|
|
||||||
(
|
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
|
||||||
for unit in home["units"]
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"max_unit_price": max(
|
|
||||||
(
|
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
|
||||||
for unit in home["units"]
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"avg_unit_price": sum(prices) // len(prices) if num_units else None,
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_price(home: dict) -> dict:
|
|
||||||
price = int(home["hdpData"]["homeInfo"]["priceForHDP"])
|
|
||||||
square_feet = home["hdpData"]["homeInfo"].get("livingArea")
|
|
||||||
|
|
||||||
lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue")
|
|
||||||
price_per_square_foot = price // square_feet if square_feet and price else None
|
|
||||||
|
|
||||||
return {
|
|
||||||
k: v
|
|
||||||
for k, v in locals().items()
|
|
||||||
if k in ["price", "square_feet", "lot_size", "price_per_square_foot"]
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_agent_name(home: dict) -> str | None:
|
|
||||||
broker_str = home.get("brokerName", "")
|
|
||||||
match = re.search(r"Listing by: (.+)", broker_str)
|
|
||||||
return match.group(1) if match else None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_address_two(address_one: str):
|
def _parse_address_two(address_one: str):
|
||||||
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
|
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
|
||||||
address_two = apt_match.group().strip() if apt_match else None
|
return apt_match.group().strip() if apt_match else None
|
||||||
address_one = (
|
|
||||||
address_one.replace(address_two, "").strip() if address_two else address_one
|
|
||||||
)
|
|
||||||
return address_one, address_two
|
|
||||||
|
|
||||||
@staticmethod
|
def _extract_address(self, address_str):
|
||||||
def _extract_address(home: dict) -> Address:
|
"""
|
||||||
keys = ("streetAddress", "city", "state", "zipcode")
|
Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
|
||||||
address_one, city, state, zip_code = (
|
and return an Address object.
|
||||||
home["hdpData"]["homeInfo"][key] for key in keys
|
"""
|
||||||
|
parts = address_str.split(", ")
|
||||||
|
|
||||||
|
if len(parts) != 3:
|
||||||
|
raise ValueError(f"Unexpected address format: {address_str}")
|
||||||
|
|
||||||
|
street_address = parts[0].strip()
|
||||||
|
city = parts[1].strip()
|
||||||
|
state_zip = parts[2].split(" ")
|
||||||
|
|
||||||
|
if len(state_zip) == 1:
|
||||||
|
state = state_zip[0].strip()
|
||||||
|
zip_code = None
|
||||||
|
elif len(state_zip) == 2:
|
||||||
|
state = state_zip[0].strip()
|
||||||
|
zip_code = state_zip[1].strip()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||||
|
|
||||||
|
unit = self._parse_address_two(street_address)
|
||||||
|
return Address(
|
||||||
|
street_address=street_address,
|
||||||
|
city=city,
|
||||||
|
unit=unit,
|
||||||
|
state=state,
|
||||||
|
zip_code=zip_code,
|
||||||
|
country="USA",
|
||||||
)
|
)
|
||||||
address_one, address_two = ZillowScraper._parse_address_two(address_one)
|
|
||||||
return Address(address_one, city, state, zip_code, address_two=address_two)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_headers():
|
def _get_headers():
|
||||||
return {
|
return {
|
||||||
"authority": "parser-external.geo.moveaws.com",
|
"authority": "www.zillow.com",
|
||||||
"accept": "*/*",
|
"accept": "*/*",
|
||||||
"accept-language": "en-US,en;q=0.9",
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"content-type": "application/json",
|
||||||
|
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
|
||||||
"origin": "https://www.zillow.com",
|
"origin": "https://www.zillow.com",
|
||||||
"referer": "https://www.zillow.com/",
|
"referer": "https://www.zillow.com/homes/Dallas,-TX_rb/",
|
||||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
||||||
"sec-ch-ua-mobile": "?0",
|
"sec-ch-ua-mobile": "?0",
|
||||||
"sec-ch-ua-platform": '"Windows"',
|
"sec-ch-ua-platform": '"Windows"',
|
||||||
"sec-fetch-dest": "empty",
|
"sec-fetch-dest": "empty",
|
||||||
"sec-fetch-mode": "cors",
|
"sec-fetch-mode": "cors",
|
||||||
"sec-fetch-site": "cross-site",
|
"sec-fetch-site": "same-origin",
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,3 +12,7 @@ class NoResultsFound(Exception):
|
||||||
|
|
||||||
class PropertyNotFound(Exception):
|
class PropertyNotFound(Exception):
|
||||||
"""Raised when no property is found for the given address"""
|
"""Raised when no property is found for the given address"""
|
||||||
|
|
||||||
|
|
||||||
|
class BoxBoundsNotFound(Exception):
|
||||||
|
"""Raised when the map bounding box cannot be located in the search page"""
|
||||||
|
|
Loading…
Reference in New Issue