Merge pull request #1 from ZacharyHampton/zillow_backend_ep

pull/2/head
Zachary Hampton 2023-09-18 13:52:43 -07:00 committed by GitHub
commit fe351ab57c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 500 additions and 283 deletions

View File

@ -1,7 +1,7 @@
from .core.scrapers.redfin import RedfinScraper from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building, SiteName from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
from typing import Union from typing import Union
@ -25,60 +25,65 @@ def validate_input(site_name: str, listing_type: str) -> None:
) )
def get_ordered_properties(result: Union[Building, Property]) -> list[str]: def get_ordered_properties(result: Property) -> list[str]:
if isinstance(result, Property): return [
return [ "property_url",
"listing_type", "site_name",
"address_one", "listing_type",
"city", "property_type",
"state", "status_text",
"zip_code", "currency",
"address_two", "price",
"url", "apt_min_price",
"property_type", "tax_assessed_value",
"price", "square_feet",
"beds", "price_per_sqft",
"baths", "beds",
"square_feet", "baths",
"price_per_square_foot", "lot_area_value",
"lot_size", "lot_area_unit",
"stories", "street_address",
"year_built", "unit",
"agent_name", "city",
"mls_id", "state",
"description", "zip_code",
] "country",
elif isinstance(result, Building): "posted_time",
return [ "bldg_min_beds",
"address_one", "bldg_min_baths",
"city", "bldg_min_area",
"state", "bldg_unit_count",
"zip_code", "bldg_name",
"address_two", "stories",
"url", "year_built",
"num_units", "agent_name",
"min_unit_price", "mls_id",
"max_unit_price", "description",
"avg_unit_price", "img_src",
"listing_type", "latitude",
] "longitude",
return [] ]
def process_result(result: Union[Building, Property]) -> pd.DataFrame: def process_result(result: Property) -> pd.DataFrame:
prop_data = result.__dict__ prop_data = result.__dict__
address_data = prop_data["address"] prop_data["site_name"] = prop_data["site_name"].value
prop_data["site_name"] = prop_data["site_name"] prop_data["listing_type"] = prop_data["listing_type"].value.lower()
prop_data["listing_type"] = prop_data["listing_type"].value if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data.get("property_type") else None prop_data["property_type"] = prop_data["property_type"].value.lower()
prop_data["address_one"] = address_data.address_one else:
prop_data["city"] = address_data.city prop_data["property_type"] = None
prop_data["state"] = address_data.state if "address" in prop_data:
prop_data["zip_code"] = address_data.zip_code address_data = prop_data["address"]
prop_data["address_two"] = address_data.address_two prop_data["street_address"] = address_data.street_address
prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code
prop_data["country"] = address_data.country
del prop_data["address"] del prop_data["address"]
properties_df = pd.DataFrame([prop_data]) properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[get_ordered_properties(result)] properties_df = properties_df[get_ordered_properties(result)]
@ -106,12 +111,14 @@ def scrape_property(
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType[listing_type.upper()], listing_type=ListingType[listing_type.upper()],
site_name=site_name.lower(), site_name=SiteName.get_by_value(site_name.lower()),
) )
site = _scrapers[site_name.lower()](scraper_input) site = _scrapers[site_name.lower()](scraper_input)
results = site.search() results = site.search()
properties_dfs = [process_result(result) for result in results] properties_dfs = [process_result(result) for result in results]
if not properties_dfs:
return pd.DataFrame()
return pd.concat(properties_dfs, ignore_index=True) return pd.concat(properties_dfs, ignore_index=True)

View File

@ -7,7 +7,7 @@ from .models import Property, ListingType, SiteName
class ScraperInput: class ScraperInput:
location: str location: str
listing_type: ListingType listing_type: ListingType
site_name: str site_name: SiteName
proxy_url: str | None = None proxy_url: str | None = None

View File

@ -7,24 +7,37 @@ class SiteName(Enum):
REDFIN = "redfin" REDFIN = "redfin"
REALTOR = "realtor.com" REALTOR = "realtor.com"
@classmethod
def get_by_value(cls, value):
for item in cls:
if item.value == value:
return item
raise ValueError(f"{value} not found in {cls}")
class ListingType(Enum): class ListingType(Enum):
FOR_SALE = "for_sale" FOR_SALE = "FOR_SALE"
FOR_RENT = "for_rent" FOR_RENT = "FOR_RENT"
SOLD = "sold" SOLD = "SOLD"
class PropertyType(Enum): class PropertyType(Enum):
HOUSE = "HOUSE" HOUSE = "HOUSE"
BUILDING = "BUILDING"
CONDO = "CONDO" CONDO = "CONDO"
TOWNHOUSE = "TOWNHOUSE" TOWNHOUSE = "TOWNHOUSE"
SINGLE_FAMILY = "SINGLE_FAMILY" SINGLE_FAMILY = "SINGLE_FAMILY"
MULTI_FAMILY = "MULTI_FAMILY" MULTI_FAMILY = "MULTI_FAMILY"
MANUFACTURED = "MANUFACTURED" MANUFACTURED = "MANUFACTURED"
NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
APARTMENT = "APARTMENT" APARTMENT = "APARTMENT"
APARTMENTS = "APARTMENTS"
LAND = "LAND" LAND = "LAND"
LOT = "LOT"
OTHER = "OTHER" OTHER = "OTHER"
BLANK = "BLANK"
@classmethod @classmethod
def from_int_code(cls, code): def from_int_code(cls, code):
mapping = { mapping = {
@ -38,47 +51,55 @@ class PropertyType(Enum):
13: cls.SINGLE_FAMILY, 13: cls.SINGLE_FAMILY,
} }
return mapping.get(code, cls.OTHER) return mapping.get(code, cls.BLANK)
@dataclass @dataclass
class Address: class Address:
address_one: str street_address: str
city: str city: str
state: str state: str
zip_code: str zip_code: str
unit: str | None = None
address_two: str | None = None country: str | None = None
@dataclass()
class Realty:
site_name: str
address: Address
url: str
listing_type: ListingType | None = None
@dataclass @dataclass
class Property(Realty): class Property:
property_url: str
site_name: SiteName
listing_type: ListingType
address: Address
property_type: PropertyType | None = None
# house for sale
price: int | None = None price: int | None = None
tax_assessed_value: int | None = None
currency: str | None = None
square_feet: int | None = None
beds: int | None = None beds: int | None = None
baths: float | None = None baths: float | None = None
lot_area_value: float | None = None
lot_area_unit: str | None = None
stories: int | None = None stories: int | None = None
year_built: int | None = None year_built: int | None = None
square_feet: int | None = None price_per_sqft: int | None = None
price_per_square_foot: int | None = None
mls_id: str | None = None mls_id: str | None = None
agent_name: str | None = None agent_name: str | None = None
property_type: PropertyType | None = None img_src: str | None = None
lot_size: int | None = None
description: str | None = None description: str | None = None
status_text: str | None = None
latitude: float | None = None
longitude: float | None = None
posted_time: str | None = None
# building for sale
bldg_name: str | None = None
bldg_unit_count: int | None = None
bldg_min_beds: int | None = None
bldg_min_baths: float | None = None
bldg_min_area: int | None = None
@dataclass # apt
class Building(Realty): apt_min_price: int | None = None
num_units: int | None = None
min_unit_price: int | None = None
max_unit_price: int | None = None
avg_unit_price: int | None = None

View File

@ -3,6 +3,7 @@ from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -29,7 +30,7 @@ class RealtorScraper(Scraper):
params = { params = {
"input": self.location, "input": self.location,
"client_id": self.listing_type.value.replace('_', '-'), "client_id": self.listing_type.value.lower().replace("_", "-"),
"limit": "1", "limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
@ -96,46 +97,57 @@ class RealtorScraper(Scraper):
} }
}""" }"""
variables = { variables = {"property_id": property_id}
'property_id': property_id
}
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response_json = response.json() response_json = response.json()
property_info = response_json['data']['property'] property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"]
unit = parse_address_two(street_address)
return [Property( return [
site_name=self.site_name, Property(
address=Address( site_name=self.site_name,
address_one=property_info['address']['line'], address=Address(
city=property_info['address']['city'], street_address=street_address,
state=property_info['address']['state_code'], city=property_info["address"]["city"],
zip_code=property_info['address']['postal_code'], state=property_info["address"]["state_code"],
), zip_code=property_info["address"]["postal_code"],
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], unit=unit,
beds=property_info['basic']['beds'], country="USA",
baths=property_info['basic']['baths'], ),
stories=property_info['details']['stories'], property_url="https://www.realtor.com/realestateandhomes-detail/"
year_built=property_info['details']['year_built'], + property_info["details"]["permalink"],
square_feet=property_info['basic']['sqft'], beds=property_info["basic"]["beds"],
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] baths=property_info["basic"]["baths"],
if property_info['basic']['sqft'] is not None and stories=property_info["details"]["stories"],
property_info['basic']['price'] is not None year_built=property_info["details"]["year_built"],
else None, square_feet=property_info["basic"]["sqft"],
price=property_info['basic']['price'], price_per_sqft=property_info["basic"]["price"]
mls_id=property_id, // property_info["basic"]["sqft"]
listing_type=self.listing_type, if property_info["basic"]["sqft"] is not None
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, and property_info["basic"]["price"] is not None
)] else None,
price=property_info["basic"]["price"],
mls_id=property_id,
listing_type=self.listing_type,
lot_area_value=property_info["public_record"]["lot_size"]
if property_info["public_record"] is not None
else None,
)
]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: def handle_area(
query = """query Home_search( self, variables: dict, return_total: bool = False
) -> list[Property] | int:
query = (
"""query Home_search(
$city: String, $city: String,
$county: [String], $county: [String],
$state_code: String, $state_code: String,
@ -193,42 +205,57 @@ class RealtorScraper(Scraper):
} }
} }
} }
}""" % self.listing_type.value }"""
% self.listing_type.value.lower()
)
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response.raise_for_status()
response_json = response.json() response_json = response.json()
if return_total: if return_total:
return response_json['data']['home_search']['total'] return response_json["data"]["home_search"]["total"]
properties: list[Property] = [] properties: list[Property] = []
for result in response_json['data']['home_search']['results']: if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "home_search" not in response_json["data"]
or response_json["data"]["home_search"] is None
or "results" not in response_json["data"]["home_search"]
):
return []
for result in response_json["data"]["home_search"]["results"]:
realty_property = Property( realty_property = Property(
address=Address( address=Address(
address_one=result['location']['address']['line'], street_address=result["location"]["address"]["line"],
city=result['location']['address']['city'], city=result["location"]["address"]["city"],
state=result['location']['address']['state_code'], state=result["location"]["address"]["state_code"],
zip_code=result['location']['address']['postal_code'], zip_code=result["location"]["address"]["postal_code"],
address_two=result['location']['address']['unit'], unit=result["location"]["address"]["unit"],
country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], property_url="https://www.realtor.com/realestateandhomes-detail/"
beds=result['description']['beds'], + result["property_id"],
baths=result['description']['baths'], beds=result["description"]["beds"],
stories=result['description']['stories'], baths=result["description"]["baths"],
year_built=result['description']['year_built'], stories=result["description"]["stories"],
square_feet=result['description']['sqft'], year_built=result["description"]["year_built"],
price_per_square_foot=result['price_per_sqft'], square_feet=result["description"]["sqft"],
price=result['list_price'], price_per_sqft=result["price_per_sqft"],
mls_id=result['property_id'], price=result["list_price"],
mls_id=result["property_id"],
listing_type=self.listing_type, listing_type=self.listing_type,
lot_size=result['description']['lot_sqft'], lot_area_value=result["description"]["lot_sqft"],
) )
properties.append(realty_property) properties.append(realty_property)
@ -239,17 +266,17 @@ class RealtorScraper(Scraper):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
if location_type == 'address': if location_type == "address":
property_id = location_info['mpr_id'] property_id = location_info["mpr_id"]
return self.handle_address(property_id) return self.handle_address(property_id)
offset = 0 offset = 0
search_variables = { search_variables = {
'city': location_info.get('city'), "city": location_info.get("city"),
'county': location_info.get('county'), "county": location_info.get("county"),
'state_code': location_info.get('state_code'), "state_code": location_info.get("state_code"),
'postal_code': location_info.get('postal_code'), "postal_code": location_info.get("postal_code"),
'offset': offset, "offset": offset,
} }
total = self.handle_area(search_variables, return_total=True) total = self.handle_area(search_variables, return_total=True)
@ -258,8 +285,11 @@ class RealtorScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures = [ futures = [
executor.submit( executor.submit(
self.handle_area, variables=search_variables | {'offset': i}, return_total=False self.handle_area,
) for i in range(0, total, 200) variables=search_variables | {"offset": i},
return_total=False,
)
for i in range(0, total, 200)
] ]
for future in as_completed(futures): for future in as_completed(futures):

View File

@ -1,7 +1,8 @@
import json import json
from ..models import Property, Address, PropertyType, Building
from .. import Scraper
from typing import Any from typing import Any
from .. import Scraper
from ....utils import parse_address_two
from ..models import Property, Address, PropertyType
class RedfinScraper(Scraper): class RedfinScraper(Scraper):
@ -38,20 +39,26 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
unit = parse_address_two(get_value("streetLine"))
address = Address( address = Address(
address_one=get_value("streetLine"), street_address=get_value("streetLine"),
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
else: else:
address_info = home["streetAddress"] address_info = home["streetAddress"]
unit = parse_address_two(address_info["assembledAddress"])
address = Address( address = Address(
address_one=address_info["assembledAddress"], street_address=address_info["assembledAddress"],
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
url = "https://www.redfin.com{}".format(home["url"]) url = "https://www.redfin.com{}".format(home["url"])
property_type = home["propertyType"] if "propertyType" in home else None property_type = home["propertyType"] if "propertyType" in home else None
@ -69,7 +76,7 @@ class RedfinScraper(Scraper):
site_name=self.site_name, site_name=self.site_name,
listing_type=self.listing_type, listing_type=self.listing_type,
address=address, address=address,
url=url, property_url=url,
beds=home["beds"] if "beds" in home else None, beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None, baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None, stories=home["stories"] if "stories" in home else None,
@ -79,41 +86,41 @@ class RedfinScraper(Scraper):
if not single_search if not single_search
else home["yearBuilt"], else home["yearBuilt"],
square_feet=get_value("sqFt"), square_feet=get_value("sqFt"),
lot_size=lot_size, lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")), property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_square_foot=get_value("pricePerSqFt"), price_per_sqft=get_value("pricePerSqFt"),
price=get_value("price"), price=get_value("price"),
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
) )
def _parse_building(self, building: dict) -> Building: def _parse_building(self, building: dict) -> Property:
return Building( return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address( address=Address(
address_one=" ".join( street_address=" ".join(
[ [
building['address']['streetNumber'], building["address"]["streetNumber"],
building['address']['directionalPrefix'], building["address"]["directionalPrefix"],
building['address']['streetName'], building["address"]["streetName"],
building['address']['streetType'], building["address"]["streetType"],
] ]
), ),
city=building['address']['city'], city=building["address"]["city"],
state=building['address']['stateOrProvinceCode'], state=building["address"]["stateOrProvinceCode"],
zip_code=building['address']['postalCode'], zip_code=building["address"]["postalCode"],
address_two=" ".join( unit=" ".join(
[ [
building['address']['unitType'], building["address"]["unitType"],
building['address']['unitValue'], building["address"]["unitValue"],
] ]
) ),
), ),
site_name=self.site_name, property_url="https://www.redfin.com{}".format(building["url"]),
url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type, listing_type=self.listing_type,
num_units=building["numUnitsForSale"], bldg_unit_count=building["numUnitsForSale"],
) )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
""" """
EPs: EPs:
@ -152,7 +159,8 @@ class RedfinScraper(Scraper):
homes = [ homes = [
self._parse_home(home) for home in response_json["payload"]["homes"] self._parse_home(home) for home in response_json["payload"]["homes"]
] + [ ] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values() self._parse_building(building)
for building in response_json["payload"]["buildings"].values()
] ]
return homes return homes

View File

@ -1,8 +1,9 @@
import re import re
import json import json
from ..models import Property, Address, Building, ListingType, PropertyType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two
from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
@ -13,6 +14,8 @@ class ZillowScraper(Scraper):
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
elif self.listing_type == ListingType.FOR_RENT: elif self.listing_type == ListingType.FOR_RENT:
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
else:
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
def search(self): def search(self):
resp = self.session.get(self.url, headers=self._get_headers()) resp = self.session.get(self.url, headers=self._get_headers())
@ -33,10 +36,17 @@ class ZillowScraper(Scraper):
data = json.loads(json_str) data = json.loads(json_str)
if "searchPageState" in data["props"]["pageProps"]: if "searchPageState" in data["props"]["pageProps"]:
houses = data["props"]["pageProps"]["searchPageState"]["cat1"][ pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
"searchResults"
]["listResults"] match = re.search(pattern, content)
return [self._parse_home(house) for house in houses]
if match:
coords = [float(coord) for coord in match.groups()]
return self._fetch_properties_backend(coords)
else:
raise BoxBoundsNotFound("Box bounds could not be located.")
elif "gdpClientCache" in data["props"]["pageProps"]: elif "gdpClientCache" in data["props"]["pageProps"]:
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"]) gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
main_key = list(gdp_client_cache.keys())[0] main_key = list(gdp_client_cache.keys())[0]
@ -47,45 +57,177 @@ class ZillowScraper(Scraper):
return [property] return [property]
raise PropertyNotFound("Specific property data not found in the response.") raise PropertyNotFound("Specific property data not found in the response.")
def _parse_home(self, home: dict): def _fetch_properties_backend(self, coords):
""" url = "https://www.zillow.com/async-create-search-page-state"
This method is used when a user enters a generic location & zillow returns more than one property
""" filter_state_for_sale = {
url = ( "sortSelection": {
f"https://www.zillow.com{home['detailUrl']}" # "value": "globalrelevanceex"
if "zillow.com" not in home["detailUrl"] "value": "days"
else home["detailUrl"] },
"isAllHomes": {"value": True},
}
filter_state_for_rent = {
"isForRent": {"value": True},
"isForSaleByAgent": {"value": False},
"isForSaleByOwner": {"value": False},
"isNewConstruction": {"value": False},
"isComingSoon": {"value": False},
"isAuction": {"value": False},
"isForSaleForeclosure": {"value": False},
"isAllHomes": {"value": True},
}
filter_state_sold = {
"isRecentlySold": {"value": True},
"isForSaleByAgent": {"value": False},
"isForSaleByOwner": {"value": False},
"isNewConstruction": {"value": False},
"isComingSoon": {"value": False},
"isAuction": {"value": False},
"isForSaleForeclosure": {"value": False},
"isAllHomes": {"value": True},
}
selected_filter = (
filter_state_for_rent
if self.listing_type == ListingType.FOR_RENT
else filter_state_for_sale
if self.listing_type == ListingType.FOR_SALE
else filter_state_sold
) )
if "hdpData" in home and "homeInfo" in home["hdpData"]: payload = {
price_data = self._extract_price(home) "searchQueryState": {
address = self._extract_address(home) "pagination": {},
agent_name = self._extract_agent_name(home) "isMapVisible": True,
beds = home["hdpData"]["homeInfo"]["bedrooms"] "mapBounds": {
baths = home["hdpData"]["homeInfo"]["bathrooms"] "west": coords[0],
property_type = home["hdpData"]["homeInfo"].get("homeType") "east": coords[1],
"south": coords[2],
"north": coords[3],
},
"filterState": selected_filter,
"isListVisible": True,
"mapZoom": 11,
},
"wants": {"cat1": ["mapResults"]},
"isDebugRequest": False,
}
resp = self.session.put(url, headers=self._get_headers(), json=payload)
resp.raise_for_status()
a = resp.json()
return self._parse_properties(resp.json())
return Property( def _parse_properties(self, property_data: dict):
site_name=self.site_name, mapresults = property_data["cat1"]["searchResults"]["mapResults"]
address=address,
agent_name=agent_name,
url=url,
beds=beds,
baths=baths,
listing_type=self.listing_type,
property_type=PropertyType(property_type),
**price_data,
)
else:
keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
address_one, city, state, zip_code = (home[key] for key in keys)
address_one, address_two = self._parse_address_two(address_one)
address = Address(address_one, city, state, zip_code, address_two)
building_info = self._extract_building_info(home) properties_list = []
return Building(
site_name=self.site_name, address=address, url=url, **building_info for result in mapresults:
if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"]
address_data = {
"street_address": home_info["streetAddress"],
"unit": home_info.get("unit"),
"city": home_info["city"],
"state": home_info["state"],
"zip_code": home_info["zipcode"],
"country": home_info["country"],
}
property_data = {
"site_name": self.site_name,
"address": Address(**address_data),
"property_url": f"https://www.zillow.com{result['detailUrl']}",
"beds": int(home_info["bedrooms"])
if "bedrooms" in home_info
else None,
"baths": home_info.get("bathrooms"),
"square_feet": int(home_info["livingArea"])
if "livingArea" in home_info
else None,
"currency": home_info["currency"],
"price": home_info.get("price"),
"square_feet": int(home_info["livingArea"])
if "livingArea" in home_info
else None,
"tax_assessed_value": int(home_info["taxAssessedValue"])
if "taxAssessedValue" in home_info
else None,
"property_type": PropertyType(home_info["homeType"]),
"listing_type": ListingType(
home_info["statusType"]
if "statusType" in home_info
else self.listing_type
),
"lot_area_value": round(home_info["lotAreaValue"], 2)
if "lotAreaValue" in home_info
else None,
"lot_area_unit": home_info.get("lotAreaUnit"),
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
"status_text": result.get("statusText"),
"posted_time": result["variableData"]["text"]
if "variableData" in result
and "text" in result["variableData"]
and result["variableData"]["type"] == "TIME_ON_INFO"
else None,
"img_src": result.get("imgSrc"),
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
if "livingArea" in home_info and "price" in home_info
else None,
}
property_obj = Property(**property_data)
properties_list.append(property_obj)
elif "isBuilding" in result:
price = result["price"]
building_data = {
"property_url": f"https://www.zillow.com{result['detailUrl']}",
"site_name": self.site_name,
"property_type": PropertyType("BUILDING"),
"listing_type": ListingType(result["statusType"]),
"img_src": result["imgSrc"],
"price": int(price.replace("From $", "").replace(",", ""))
if "From $" in price
else None,
"apt_min_price": int(
price.replace("$", "").replace(",", "").replace("+/mo", "")
)
if "+/mo" in price
else None,
"address": self._extract_address(result["address"]),
"bldg_min_beds": result["minBeds"],
"currency": "USD",
"bldg_min_baths": result["minBaths"],
"bldg_min_area": result.get("minArea"),
"bldg_unit_count": result["unitCount"],
"bldg_name": result.get("communityName"),
"status_text": result["statusText"],
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
}
building_obj = Property(**building_data)
properties_list.append(building_obj)
return properties_list
def _extract_units(self, result: dict):
units = {}
if "units" in result:
num_units = result.get("availabilityCount", len(result["units"]))
prices = [
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in result["units"]
]
units["apt_availability_count"] = num_units
units["apt_min_unit_price"] = min(prices)
units["apt_max_unit_price"] = max(prices)
units["apt_avg_unit_price"] = (
sum(prices) // num_units if num_units else None
) )
return units
def _get_single_property_page(self, property_data: dict): def _get_single_property_page(self, property_data: dict):
""" """
@ -97,32 +239,38 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
address_one, address_two = self._parse_address_two( unit = parse_address_two(address_data["streetAddress"])
address_data["streetAddress"]
)
address = Address( address = Address(
address_one=address_one, street_address=address_data["streetAddress"],
address_two=address_two, unit=unit,
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
country=property_data.get("country"),
) )
property_type = property_data.get("homeType", None) property_type = property_data.get("homeType", None)
return Property( return Property(
site_name=self.site_name, site_name=self.site_name,
address=address, address=address,
url=url, property_url=url,
beds=property_data.get("bedrooms", None), beds=property_data.get("bedrooms", None),
baths=property_data.get("bathrooms", None), baths=property_data.get("bathrooms", None),
year_built=property_data.get("yearBuilt", None), year_built=property_data.get("yearBuilt", None),
price=property_data.get("price", None), price=property_data.get("price", None),
lot_size=property_data.get("lotSize", None), tax_assessed_value=property_data.get("taxAssessedValue", None),
latitude=property_data.get("latitude"),
longitude=property_data.get("longitude"),
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
currency=property_data.get("currency", None),
lot_area_value=property_data.get("lotAreaValue"),
lot_area_unit=property_data["lotAreaUnits"].lower()
if "lotAreaUnits" in property_data
else None,
agent_name=property_data.get("attributionInfo", {}).get("agentName", None), agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
stories=property_data.get("resoFacts", {}).get("stories", None), stories=property_data.get("resoFacts", {}).get("stories", None),
description=property_data.get("description", None), description=property_data.get("description", None),
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None), mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
price_per_square_foot=property_data.get("resoFacts", {}).get( price_per_sqft=property_data.get("resoFacts", {}).get(
"pricePerSquareFoot", None "pricePerSquareFoot", None
), ),
square_feet=property_data.get("livingArea", None), square_feet=property_data.get("livingArea", None),
@ -130,81 +278,54 @@ class ZillowScraper(Scraper):
listing_type=self.listing_type, listing_type=self.listing_type,
) )
def _extract_building_info(self, home: dict) -> dict: def _extract_address(self, address_str):
num_units = len(home["units"]) """
prices = [ Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) and return an Address object.
for unit in home["units"] """
] parts = address_str.split(", ")
return {
"listing_type": self.listing_type,
"num_units": len(home["units"]),
"min_unit_price": min(
(
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in home["units"]
)
),
"max_unit_price": max(
(
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in home["units"]
)
),
"avg_unit_price": sum(prices) // len(prices) if num_units else None,
}
@staticmethod if len(parts) != 3:
def _extract_price(home: dict) -> dict: raise ValueError(f"Unexpected address format: {address_str}")
price = int(home["hdpData"]["homeInfo"]["priceForHDP"])
square_feet = home["hdpData"]["homeInfo"].get("livingArea")
lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue") street_address = parts[0].strip()
price_per_square_foot = price // square_feet if square_feet and price else None city = parts[1].strip()
state_zip = parts[2].split(" ")
return { if len(state_zip) == 1:
k: v state = state_zip[0].strip()
for k, v in locals().items() zip_code = None
if k in ["price", "square_feet", "lot_size", "price_per_square_foot"] elif len(state_zip) == 2:
} state = state_zip[0].strip()
zip_code = state_zip[1].strip()
else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
@staticmethod unit = parse_address_two(street_address)
def _extract_agent_name(home: dict) -> str | None: return Address(
broker_str = home.get("brokerName", "") street_address=street_address,
match = re.search(r"Listing by: (.+)", broker_str) city=city,
return match.group(1) if match else None unit=unit,
state=state,
@staticmethod zip_code=zip_code,
def _parse_address_two(address_one: str): country="USA",
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
address_two = apt_match.group().strip() if apt_match else None
address_one = (
address_one.replace(address_two, "").strip() if address_two else address_one
) )
return address_one, address_two
@staticmethod
def _extract_address(home: dict) -> Address:
    """
    Build an Address from the ``hdpData.homeInfo`` record of a listing.

    Reads the ``streetAddress``, ``city``, ``state`` and ``zipcode`` keys
    (a missing key raises ``KeyError``), then splits the street line into
    its first and second address parts via ``_parse_address_two``.
    """
    street, city, state, zip_code = [
        home["hdpData"]["homeInfo"][field]
        for field in ("streetAddress", "city", "state", "zipcode")
    ]
    # The helper hands back the street line and the second address part separately.
    split_address = ZillowScraper._parse_address_two(street)
    address_one, address_two = split_address
    return Address(address_one, city, state, zip_code, address_two=address_two)
@staticmethod @staticmethod
def _get_headers(): def _get_headers():
return { return {
"authority": "parser-external.geo.moveaws.com", "authority": "www.zillow.com",
"accept": "*/*", "accept": "*/*",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"content-type": "application/json",
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
"origin": "https://www.zillow.com", "origin": "https://www.zillow.com",
"referer": "https://www.zillow.com/", "referer": "https://www.zillow.com",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0", "sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"', "sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty", "sec-fetch-dest": "empty",
"sec-fetch-mode": "cors", "sec-fetch-mode": "cors",
"sec-fetch-site": "cross-site", "sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
} }

View File

@ -12,3 +12,7 @@ class NoResultsFound(Exception):
class PropertyNotFound(Exception): class PropertyNotFound(Exception):
"""Raised when no property is found for the given address""" """Raised when no property is found for the given address"""
class BoxBoundsNotFound(Exception):
    """Raised when box bounds cannot be found"""

    # NOTE(review): the previous docstring was a verbatim copy of
    # PropertyNotFound's. The raising site is not visible from here, so
    # confirm exactly which bounds lookup this exception reports.

6
homeharvest/utils.py Normal file
View File

@ -0,0 +1,6 @@
import re
def parse_address_two(address_one: str):
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
return apt_match.group().strip() if apt_match else None

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.1.3" version = "0.1.4"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@ -3,9 +3,17 @@ from homeharvest import scrape_property
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), scrape_property(
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format location="2530 Al Lipscomb Way",
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format site_name="realtor.com",
listing_type="for_sale",
),
scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
), #: does not support "city, state, USA" format
scrape_property(location="85281", site_name="realtor.com"), scrape_property(location="85281", site_name="realtor.com"),
] ]

View File

@ -3,9 +3,15 @@ from homeharvest import scrape_property
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"), location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="redfin"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
),
scrape_property(location="85281", site_name="redfin"), scrape_property(location="85281", site_name="redfin"),
] ]

View File

@ -3,9 +3,15 @@ from homeharvest import scrape_property
def test_zillow(): def test_zillow():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="zillow"), location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="zillow"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
] ]