Merge pull request #13 from ZacharyHampton/simplify_fields

fix: simplify fields
pull/14/head
Zachary Hampton 2023-09-19 19:16:18 -07:00 committed by GitHub
commit 66e55173b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 276 additions and 329 deletions

View File

@ -23,9 +23,7 @@ def _validate_input(site_name: str, listing_type: str) -> None:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.") raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__: if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType( raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
f"Provided listing type, '{listing_type}', does not exist."
)
def _get_ordered_properties(result: Property) -> list[str]: def _get_ordered_properties(result: Property) -> list[str]:
@ -35,34 +33,26 @@ def _get_ordered_properties(result: Property) -> list[str]:
"listing_type", "listing_type",
"property_type", "property_type",
"status_text", "status_text",
"currency", "baths_min",
"price", "baths_max",
"apt_min_price", "beds_min",
"apt_max_price", "beds_max",
"apt_min_sqft", "sqft_min",
"apt_max_sqft", "sqft_max",
"apt_min_beds", "price_min",
"apt_max_beds", "price_max",
"apt_min_baths", "unit_count",
"apt_max_baths",
"tax_assessed_value", "tax_assessed_value",
"square_feet",
"price_per_sqft", "price_per_sqft",
"beds",
"baths",
"lot_area_value", "lot_area_value",
"lot_area_unit", "lot_area_unit",
"street_address", "address_one",
"unit", "address_two",
"city", "city",
"state", "state",
"zip_code", "zip_code",
"country",
"posted_time", "posted_time",
"bldg_min_beds", "area_min",
"bldg_min_baths",
"bldg_min_area",
"bldg_unit_count",
"bldg_name", "bldg_name",
"stories", "stories",
"year_built", "year_built",
@ -86,12 +76,11 @@ def _process_result(result: Property) -> pd.DataFrame:
prop_data["property_type"] = None prop_data["property_type"] = None
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["street_address"] = address_data.street_address prop_data["address_one"] = address_data.address_one
prop_data["unit"] = address_data.unit prop_data["address_two"] = address_data.address_two
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
prop_data["state"] = address_data.state prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code prop_data["zip_code"] = address_data.zip_code
prop_data["country"] = address_data.country
del prop_data["address"] del prop_data["address"]
@ -101,9 +90,7 @@ def _process_result(result: Property) -> pd.DataFrame:
return properties_df return properties_df
def _scrape_single_site( def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
location: str, site_name: str, listing_type: str, proxy: str = None
) -> pd.DataFrame:
""" """
Helper function to scrape a single site. Helper function to scrape a single site.
""" """
@ -120,9 +107,7 @@ def _scrape_single_site(
results = site.search() results = site.search()
properties_dfs = [_process_result(result) for result in results] properties_dfs = [_process_result(result) for result in results]
properties_dfs = [ properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
@ -158,9 +143,7 @@ def scrape_property(
else: else:
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = { futures = {
executor.submit( executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name
_scrape_single_site, location, s_name, listing_type, proxy
): s_name
for s_name in site_name for s_name in site_name
} }
@ -175,14 +158,12 @@ def scrape_property(
final_df = pd.concat(results, ignore_index=True) final_df = pd.concat(results, ignore_index=True)
columns_to_track = ["street_address", "city", "unit"] columns_to_track = ["address_one", "address_two", "city"]
#: validate they exist, otherwise create them #: validate they exist, otherwise create them
for col in columns_to_track: for col in columns_to_track:
if col not in final_df.columns: if col not in final_df.columns:
final_df[col] = None final_df[col] = None
final_df = final_df.drop_duplicates( final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
subset=["street_address", "city", "unit"], keep="first"
)
return final_df return final_df

View File

@ -5,9 +5,7 @@ from homeharvest import scrape_property
def main(): def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper") parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument( parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)
parser.add_argument( parser.add_argument(
"-s", "-s",
@ -44,15 +42,11 @@ def main():
help="Name of the output file (without extension)", help="Name of the output file (without extension)",
) )
parser.add_argument( parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)
args = parser.parse_args() args = parser.parse_args()
result = scrape_property( result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
args.location, args.site_name, args.listing_type, proxy=args.proxy
)
if not args.filename: if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

View File

@ -19,10 +19,7 @@ class Scraper:
self.session = requests.Session() self.session = requests.Session()
if scraper_input.proxy: if scraper_input.proxy:
proxy_url = scraper_input.proxy proxy_url = scraper_input.proxy
proxies = { proxies = {"http": proxy_url, "https": proxy_url}
"http": proxy_url,
"https": proxy_url
}
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name

View File

@ -1,5 +1,6 @@
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Tuple
class SiteName(Enum): class SiteName(Enum):
@ -56,12 +57,11 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
street_address: str address_one: str | None = None
city: str address_two: str | None = "#"
state: str city: str | None = None
zip_code: str state: str | None = None
unit: str | None = None zip_code: str | None = None
country: str | None = None
@dataclass @dataclass
@ -73,12 +73,7 @@ class Property:
property_type: PropertyType | None = None property_type: PropertyType | None = None
# house for sale # house for sale
price: int | None = None
tax_assessed_value: int | None = None tax_assessed_value: int | None = None
currency: str | None = None
square_feet: int | None = None
beds: int | None = None
baths: float | None = None
lot_area_value: float | None = None lot_area_value: float | None = None
lot_area_unit: str | None = None lot_area_unit: str | None = None
stories: int | None = None stories: int | None = None
@ -90,23 +85,25 @@ class Property:
img_src: str | None = None img_src: str | None = None
description: str | None = None description: str | None = None
status_text: str | None = None status_text: str | None = None
latitude: float | None = None
longitude: float | None = None
posted_time: str | None = None posted_time: str | None = None
# building for sale # building for sale
bldg_name: str | None = None bldg_name: str | None = None
bldg_unit_count: int | None = None area_min: int | None = None
bldg_min_beds: int | None = None
bldg_min_baths: float | None = None
bldg_min_area: int | None = None
# apt beds_min: int | None = None
apt_min_beds: int | None = None beds_max: int | None = None
apt_max_beds: int | None = None
apt_min_baths: float | None = None baths_min: float | None = None
apt_max_baths: float | None = None baths_max: float | None = None
apt_min_price: int | None = None
apt_max_price: int | None = None sqft_min: int | None = None
apt_min_sqft: int | None = None sqft_max: int | None = None
apt_max_sqft: int | None = None
price_min: int | None = None
price_max: int | None = None
unit_count: int | None = None
latitude: float | None = None
longitude: float | None = None

View File

@ -1,16 +1,23 @@
import json """
homeharvest.realtor.__init__
~~~~~~~~~~~~
This module implements the scraper for relator.com
"""
from ..models import Property, Address from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two, parse_unit from ....utils import parse_address_one, parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
self.counter = 1
super().__init__(scraper_input) super().__init__(scraper_input)
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" self.search_url = (
"https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
)
def handle_location(self): def handle_location(self):
headers = { headers = {
@ -50,6 +57,9 @@ class RealtorScraper(Scraper):
return result[0] return result[0]
def handle_address(self, property_id: str) -> list[Property]: def handle_address(self, property_id: str) -> list[Property]:
"""
Handles a specific address & returns one property
"""
query = """query Property($property_id: ID!) { query = """query Property($property_id: ID!) {
property(id: $property_id) { property(id: $property_id) {
property_id property_id
@ -108,43 +118,45 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["property"]
street_address, unit = parse_address_two(property_info["address"]["line"]) address_one, address_two = parse_address_one(property_info["address"]["line"])
return [ return [
Property( Property(
site_name=self.site_name, site_name=self.site_name,
address=Address( address=Address(
street_address=street_address, address_one=address_one,
address_two=address_two,
city=property_info["address"]["city"], city=property_info["address"]["city"],
state=property_info["address"]["state_code"], state=property_info["address"]["state_code"],
zip_code=property_info["address"]["postal_code"], zip_code=property_info["address"]["postal_code"],
unit=unit,
country="USA",
), ),
property_url="https://www.realtor.com/realestateandhomes-detail/" property_url="https://www.realtor.com/realestateandhomes-detail/"
+ property_info["details"]["permalink"], + property_info["details"]["permalink"],
beds=property_info["basic"]["beds"],
baths=property_info["basic"]["baths"],
stories=property_info["details"]["stories"], stories=property_info["details"]["stories"],
year_built=property_info["details"]["year_built"], year_built=property_info["details"]["year_built"],
square_feet=property_info["basic"]["sqft"], price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"]
price_per_sqft=property_info["basic"]["price"] if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None
// property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None
and property_info["basic"]["price"] is not None
else None, else None,
price=property_info["basic"]["price"],
mls_id=property_id, mls_id=property_id,
listing_type=self.listing_type, listing_type=self.listing_type,
lot_area_value=property_info["public_record"]["lot_size"] lot_area_value=property_info["public_record"]["lot_size"]
if property_info["public_record"] is not None if property_info["public_record"] is not None
else None, else None,
beds_min=property_info["basic"]["beds"],
beds_max=property_info["basic"]["beds"],
baths_min=property_info["basic"]["baths"],
baths_max=property_info["basic"]["baths"],
sqft_min=property_info["basic"]["sqft"],
sqft_max=property_info["basic"]["sqft"],
price_min=property_info["basic"]["price"],
price_max=property_info["basic"]["price"],
) )
] ]
def handle_area( def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
self, variables: dict, return_total: bool = False """
) -> list[Property] | int: Handles a location area & returns a list of properties
"""
query = ( query = (
"""query Home_search( """query Home_search(
$city: String, $city: String,
@ -237,17 +249,15 @@ class RealtorScraper(Scraper):
return [] return []
for result in response_json["data"]["home_search"]["results"]: for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two( self.counter += 1
result["location"]["address"]["line"] address_one, _ = parse_address_one(result["location"]["address"]["line"])
)
realty_property = Property( realty_property = Property(
address=Address( address=Address(
street_address=street_address, address_one=address_one,
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip_code=result["location"]["address"]["postal_code"],
unit=parse_unit(result["location"]["address"]["unit"]), address_two=parse_address_two(result["location"]["address"]["unit"]),
country="USA",
), ),
latitude=result["location"]["address"]["coordinate"]["lat"] latitude=result["location"]["address"]["coordinate"]["lat"]
if result if result
@ -264,20 +274,22 @@ class RealtorScraper(Scraper):
and "lon" in result["location"]["address"]["coordinate"] and "lon" in result["location"]["address"]["coordinate"]
else None, else None,
site_name=self.site_name, site_name=self.site_name,
property_url="https://www.realtor.com/realestateandhomes-detail/" property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"],
+ result["property_id"],
beds=result["description"]["beds"],
baths=result["description"]["baths"],
stories=result["description"]["stories"], stories=result["description"]["stories"],
year_built=result["description"]["year_built"], year_built=result["description"]["year_built"],
square_feet=result["description"]["sqft"],
price_per_sqft=result["price_per_sqft"], price_per_sqft=result["price_per_sqft"],
price=result["list_price"],
mls_id=result["property_id"], mls_id=result["property_id"],
listing_type=self.listing_type, listing_type=self.listing_type,
lot_area_value=result["description"]["lot_sqft"], lot_area_value=result["description"]["lot_sqft"],
beds_min=result["description"]["beds"],
beds_max=result["description"]["beds"],
baths_min=result["description"]["baths"],
baths_max=result["description"]["baths"],
sqft_min=result["description"]["sqft"],
sqft_max=result["description"]["sqft"],
price_min=result["list_price"],
price_max=result["list_price"],
) )
properties.append(realty_property) properties.append(realty_property)
return properties return properties

View File

@ -1,7 +1,13 @@
"""
homeharvest.redfin.__init__
~~~~~~~~~~~~
This module implements the scraper for redfin.com
"""
import json import json
from typing import Any from typing import Any
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two, parse_unit from ....utils import parse_address_two, parse_address_one
from ..models import Property, Address, PropertyType, ListingType, SiteName from ..models import Property, Address, PropertyType, ListingType, SiteName
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
@ -12,9 +18,7 @@ class RedfinScraper(Scraper):
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
def _handle_location(self): def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format( url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(self.location)
self.location
)
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", "")) response_json = json.loads(response.text.replace("{}&&", ""))
@ -28,9 +32,7 @@ class RedfinScraper(Scraper):
return "address" #: address, needs to be handled differently return "address" #: address, needs to be handled differently
if "exactMatch" not in response_json["payload"]: if "exactMatch" not in response_json["payload"]:
raise NoResultsFound( raise NoResultsFound("No results found for location: {}".format(self.location))
"No results found for location: {}".format(self.location)
)
if response_json["payload"]["exactMatch"] is not None: if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"] target = response_json["payload"]["exactMatch"]
@ -45,39 +47,30 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address( address = Address(
street_address=street_address, address_one=parse_address_one(get_value("streetLine"))[0],
address_two=parse_address_one(get_value("streetLine"))[1],
city=home.get("city"), city=home.get("city"),
state=home.get("state"), state=home.get("state"),
zip_code=home.get("zip"), zip_code=home.get("zip"),
unit=unit,
country="USA",
) )
else: else:
address_info = home.get("streetAddress") address_info = home.get("streetAddress")
street_address, unit = parse_address_two(address_info.get("assembledAddress")) address_one, address_two = parse_address_one(address_info.get("assembledAddress"))
address = Address( address = Address(
street_address=street_address, address_one=address_one,
address_two=address_two,
city=home.get("city"), city=home.get("city"),
state=home.get("state"), state=home.get("state"),
zip_code=home.get("zip"), zip_code=home.get("zip"),
unit=unit,
country="USA",
) )
url = "https://www.redfin.com{}".format(home["url"]) url = "https://www.redfin.com{}".format(home["url"])
#: property_type = home["propertyType"] if "propertyType" in home else None
lot_size_data = home.get("lotSize") lot_size_data = home.get("lotSize")
if not isinstance(lot_size_data, int): if not isinstance(lot_size_data, int):
lot_size = ( lot_size = lot_size_data.get("value", None) if isinstance(lot_size_data, dict) else None
lot_size_data.get("value", None)
if isinstance(lot_size_data, dict)
else None
)
else: else:
lot_size = lot_size_data lot_size = lot_size_data
@ -86,26 +79,24 @@ class RedfinScraper(Scraper):
listing_type=self.listing_type, listing_type=self.listing_type,
address=address, address=address,
property_url=url, property_url=url,
beds=home["beds"] if "beds" in home else None, beds_min=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None, beds_max=home["beds"] if "beds" in home else None,
baths_min=home["baths"] if "baths" in home else None,
baths_max=home["baths"] if "baths" in home else None,
price_min=get_value("price"),
price_max=get_value("price"),
sqft_min=get_value("sqFt"),
sqft_max=get_value("sqFt"),
stories=home["stories"] if "stories" in home else None, stories=home["stories"] if "stories" in home else None,
agent_name=get_value("listingAgent"), agent_name=get_value("listingAgent"),
description=home["listingRemarks"] if "listingRemarks" in home else None, description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt") year_built=get_value("yearBuilt") if not single_search else home["yearBuilt"],
if not single_search
else home["yearBuilt"],
square_feet=get_value("sqFt"),
lot_area_value=lot_size, lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")), property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_sqft=get_value("pricePerSqFt"), price_per_sqft=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
latitude=home["latLong"]["latitude"] latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
if "latLong" in home and "latitude" in home["latLong"] longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
else None,
longitude=home["latLong"]["longitude"]
if "latLong" in home and "longitude" in home["latLong"]
else None,
) )
def _handle_rentals(self, region_id, region_type): def _handle_rentals(self, region_id, region_type):
@ -125,12 +116,10 @@ class RedfinScraper(Scraper):
address_info = home_data.get("addressInfo", {}) address_info = home_data.get("addressInfo", {})
centroid = address_info.get("centroid", {}).get("centroid", {}) centroid = address_info.get("centroid", {}).get("centroid", {})
address = Address( address = Address(
street_address=address_info.get("formattedStreetLine", None), address_one=parse_address_one(address_info.get("formattedStreetLine"))[0],
city=address_info.get("city", None), city=address_info.get("city"),
state=address_info.get("state", None), state=address_info.get("state"),
zip_code=address_info.get("zip", None), zip_code=address_info.get("zip"),
unit=None,
country="US" if address_info.get("countryCode", None) == 1 else None,
) )
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None}) price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
@ -143,20 +132,20 @@ class RedfinScraper(Scraper):
site_name=SiteName.REDFIN, site_name=SiteName.REDFIN,
listing_type=ListingType.FOR_RENT, listing_type=ListingType.FOR_RENT,
address=address, address=address,
apt_min_beds=bed_range.get("min", None), description=rental_data.get("description"),
apt_min_baths=bath_range.get("min", None), latitude=centroid.get("latitude"),
apt_max_beds=bed_range.get("max", None), longitude=centroid.get("longitude"),
apt_max_baths=bath_range.get("max", None), baths_min=bath_range.get("min"),
description=rental_data.get("description", None), baths_max=bath_range.get("max"),
latitude=centroid.get("latitude", None), beds_min=bed_range.get("min"),
longitude=centroid.get("longitude", None), beds_max=bed_range.get("max"),
apt_min_price=price_range.get("min", None), price_min=price_range.get("min"),
apt_max_price=price_range.get("max", None), price_max=price_range.get("max"),
apt_min_sqft=sqft_range.get("min", None), sqft_min=sqft_range.get("min"),
apt_max_sqft=sqft_range.get("max", None), sqft_max=sqft_range.get("max"),
img_src=home_data.get("staticMapUrl", None), img_src=home_data.get("staticMapUrl"),
posted_time=rental_data.get("lastUpdated", None), posted_time=rental_data.get("lastUpdated"),
bldg_name=rental_data.get("propertyName", None), bldg_name=rental_data.get("propertyName"),
) )
properties_list.append(property_) properties_list.append(property_)
@ -175,16 +164,15 @@ class RedfinScraper(Scraper):
building["address"]["streetType"], building["address"]["streetType"],
] ]
) )
street_address, unit = parse_address_two(street_address)
return Property( return Property(
site_name=self.site_name, site_name=self.site_name,
property_type=PropertyType("BUILDING"), property_type=PropertyType("BUILDING"),
address=Address( address=Address(
street_address=street_address, address_one=parse_address_one(street_address)[0],
city=building["address"]["city"], city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"], state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"], zip_code=building["address"]["postalCode"],
unit=parse_unit( address_two=parse_address_two(
" ".join( " ".join(
[ [
building["address"]["unitType"], building["address"]["unitType"],
@ -195,7 +183,7 @@ class RedfinScraper(Scraper):
), ),
property_url="https://www.redfin.com{}".format(building["url"]), property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type, listing_type=self.listing_type,
bldg_unit_count=building["numUnitsForSale"], unit_count=building["numUnitsForSale"],
) )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
@ -206,7 +194,6 @@ class RedfinScraper(Scraper):
https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3 https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3 https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
""" """
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format( url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id home_id
) )
@ -214,9 +201,7 @@ class RedfinScraper(Scraper):
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", "")) response_json = json.loads(response.text.replace("{}&&", ""))
parsed_home = self._parse_home( parsed_home = self._parse_home(response_json["payload"]["addressSectionInfo"], single_search=True)
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home] return [parsed_home]
def search(self): def search(self):
@ -235,10 +220,7 @@ class RedfinScraper(Scraper):
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000" url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", "")) response_json = json.loads(response.text.replace("{}&&", ""))
homes = [ homes = [self._parse_home(home) for home in response_json["payload"]["homes"]] + [
self._parse_home(home) for home in response_json["payload"]["homes"] self._parse_building(building) for building in response_json["payload"]["buildings"].values()
] + [
self._parse_building(building)
for building in response_json["payload"]["buildings"].values()
] ]
return homes return homes

View File

@ -1,7 +1,13 @@
"""
homeharvest.zillow.__init__
~~~~~~~~~~~~
This module implements the scraper for zillow.com
"""
import re import re
import json import json
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two, parse_unit from ....utils import parse_address_one, parse_address_two
from ....exceptions import GeoCoordsNotFound, NoResultsFound from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, ListingType, PropertyType from ..models import Property, Address, ListingType, PropertyType
@ -13,12 +19,13 @@ class ZillowScraper(Scraper):
if not self.is_plausible_location(self.location): if not self.is_plausible_location(self.location):
raise NoResultsFound("Invalid location input: {}".format(self.location)) raise NoResultsFound("Invalid location input: {}".format(self.location))
if self.listing_type == ListingType.FOR_SALE: listing_type_to_url_path = {
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" ListingType.FOR_SALE: "for_sale",
elif self.listing_type == ListingType.FOR_RENT: ListingType.FOR_RENT: "for_rent",
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" ListingType.SOLD: "recently_sold",
else: }
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/"
def is_plausible_location(self, location: str) -> bool: def is_plausible_location(self, location: str) -> bool:
url = ( url = (
@ -31,9 +38,7 @@ class ZillowScraper(Scraper):
return response.json()["results"] != [] return response.json()["results"] != []
def search(self): def search(self):
resp = self.session.get( resp = self.session.get(self.url, headers=self._get_headers())
self.url, headers=self._get_headers()
)
resp.raise_for_status() resp.raise_for_status()
content = resp.text content = resp.text
@ -43,9 +48,7 @@ class ZillowScraper(Scraper):
re.DOTALL, re.DOTALL,
) )
if not match: if not match:
raise NoResultsFound( raise NoResultsFound("No results were found for Zillow with the given Location.")
"No results were found for Zillow with the given Location."
)
json_str = match.group(1) json_str = match.group(1)
data = json.loads(json_str) data = json.loads(json_str)
@ -130,9 +133,7 @@ class ZillowScraper(Scraper):
"wants": {"cat1": ["mapResults"]}, "wants": {"cat1": ["mapResults"]},
"isDebugRequest": False, "isDebugRequest": False,
} }
resp = self.session.put( resp = self.session.put(url, headers=self._get_headers(), json=payload)
url, headers=self._get_headers(), json=payload
)
resp.raise_for_status() resp.raise_for_status()
a = resp.json() a = resp.json()
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
@ -146,87 +147,71 @@ class ZillowScraper(Scraper):
if "hdpData" in result: if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"street_address": parse_address_two(home_info["streetAddress"])[0], "address_one": parse_address_one(home_info["streetAddress"])[0],
"unit": parse_unit(home_info["unit"]) "address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
if "unit" in home_info
else None,
"city": home_info["city"], "city": home_info["city"],
"state": home_info["state"], "state": home_info["state"],
"zip_code": home_info["zipcode"], "zip_code": home_info["zipcode"],
"country": home_info["country"],
} }
property_data = { property_obj = Property(
"site_name": self.site_name, site_name=self.site_name,
"address": Address(**address_data), address=Address(**address_data),
"property_url": f"https://www.zillow.com{result['detailUrl']}", property_url=f"https://www.zillow.com{result['detailUrl']}",
"beds": int(home_info["bedrooms"]) tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None,
if "bedrooms" in home_info property_type=PropertyType(home_info["homeType"]),
else None, listing_type=ListingType(
"baths": home_info.get("bathrooms"), home_info["statusType"] if "statusType" in home_info else self.listing_type
"square_feet": int(home_info["livingArea"])
if "livingArea" in home_info
else None,
"currency": home_info["currency"],
"price": home_info.get("price"),
"tax_assessed_value": int(home_info["taxAssessedValue"])
if "taxAssessedValue" in home_info
else None,
"property_type": PropertyType(home_info["homeType"]),
"listing_type": ListingType(
home_info["statusType"]
if "statusType" in home_info
else self.listing_type
), ),
"lot_area_value": round(home_info["lotAreaValue"], 2) status_text=result.get("statusText"),
if "lotAreaValue" in home_info posted_time=result["variableData"]["text"]
else None,
"lot_area_unit": home_info.get("lotAreaUnit"),
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
"status_text": result.get("statusText"),
"posted_time": result["variableData"]["text"]
if "variableData" in result if "variableData" in result
and "text" in result["variableData"] and "text" in result["variableData"]
and result["variableData"]["type"] == "TIME_ON_INFO" and result["variableData"]["type"] == "TIME_ON_INFO"
else None, else None,
"img_src": result.get("imgSrc"), price_min=home_info.get("price"),
"price_per_sqft": int(home_info["price"] // home_info["livingArea"]) price_max=home_info.get("price"),
if "livingArea" in home_info beds_min=int(home_info["bedrooms"]) if "bedrooms" in home_info else None,
and home_info["livingArea"] != 0 beds_max=int(home_info["bedrooms"]) if "bedrooms" in home_info else None,
and "price" in home_info baths_min=home_info.get("bathrooms"),
baths_max=home_info.get("bathrooms"),
sqft_min=int(home_info["livingArea"]) if "livingArea" in home_info else None,
sqft_max=int(home_info["livingArea"]) if "livingArea" in home_info else None,
price_per_sqft=int(home_info["price"] // home_info["livingArea"])
if "livingArea" in home_info and home_info["livingArea"] != 0 and "price" in home_info
else None, else None,
} latitude=result["latLong"]["latitude"],
property_obj = Property(**property_data) longitude=result["latLong"]["longitude"],
lot_area_value=round(home_info["lotAreaValue"], 2) if "lotAreaValue" in home_info else None,
lot_area_unit=home_info.get("lotAreaUnit"),
img_src=result.get("imgSrc"),
)
properties_list.append(property_obj) properties_list.append(property_obj)
elif "isBuilding" in result: elif "isBuilding" in result:
price = result["price"] price_string = result["price"].replace("$", "").replace(",", "").replace("+/mo", "")
building_data = {
"property_url": f"https://www.zillow.com{result['detailUrl']}", match = re.search(r"(\d+)", price_string)
"site_name": self.site_name, price_value = int(match.group(1)) if match else None
"property_type": PropertyType("BUILDING"), building_obj = Property(
"listing_type": ListingType(result["statusType"]), property_url=f"https://www.zillow.com{result['detailUrl']}",
"img_src": result["imgSrc"], site_name=self.site_name,
"price": int(price.replace("From $", "").replace(",", "")) property_type=PropertyType("BUILDING"),
if "From $" in price listing_type=ListingType(result["statusType"]),
else None, img_src=result["imgSrc"],
"apt_min_price": int( address=self._extract_address(result["address"]),
price.replace("$", "").replace(",", "").replace("+/mo", "") baths_min=result["minBaths"],
) area_min=result.get("minArea"),
if "+/mo" in price bldg_name=result.get("communityName"),
else None, status_text=result["statusText"],
"address": self._extract_address(result["address"]), beds_min=result["minBeds"],
"bldg_min_beds": result["minBeds"], price_min=price_value if "+/mo" in result["price"] else None,
"currency": "USD", price_max=price_value if "+/mo" in result["price"] else None,
"bldg_min_baths": result["minBaths"], latitude=result["latLong"]["latitude"],
"bldg_min_area": result.get("minArea"), longitude=result["latLong"]["longitude"],
"bldg_unit_count": result["unitCount"], unit_count=result["unitCount"],
"bldg_name": result.get("communityName"), )
"status_text": result["statusText"],
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
}
building_obj = Property(**building_data)
properties_list.append(building_obj) properties_list.append(building_obj)
return properties_list return properties_list
@ -241,43 +226,41 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
street_address, unit = parse_address_two(address_data["streetAddress"]) address_one, address_two = parse_address_one(address_data["streetAddress"])
address = Address( address = Address(
street_address=street_address, address_one=address_one,
unit=unit, address_two=address_two if address_two else "#",
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
country=property_data.get("country"),
) )
property_type = property_data.get("homeType", None) property_type = property_data.get("homeType", None)
return Property( return Property(
site_name=self.site_name, site_name=self.site_name,
address=address,
property_url=url, property_url=url,
beds=property_data.get("bedrooms", None), property_type=PropertyType(property_type),
baths=property_data.get("bathrooms", None), listing_type=self.listing_type,
year_built=property_data.get("yearBuilt", None), address=address,
price=property_data.get("price", None), year_built=property_data.get("yearBuilt"),
tax_assessed_value=property_data.get("taxAssessedValue", None), tax_assessed_value=property_data.get("taxAssessedValue"),
lot_area_value=property_data.get("lotAreaValue"),
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
agent_name=property_data.get("attributionInfo", {}).get("agentName"),
stories=property_data.get("resoFacts", {}).get("stories"),
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
beds_min=property_data.get("bedrooms"),
beds_max=property_data.get("bedrooms"),
baths_min=property_data.get("bathrooms"),
baths_max=property_data.get("bathrooms"),
price_min=property_data.get("price"),
price_max=property_data.get("price"),
sqft_min=property_data.get("livingArea"),
sqft_max=property_data.get("livingArea"),
price_per_sqft=property_data.get("resoFacts", {}).get("pricePerSquareFoot"),
latitude=property_data.get("latitude"), latitude=property_data.get("latitude"),
longitude=property_data.get("longitude"), longitude=property_data.get("longitude"),
img_src=property_data.get("streetViewTileImageUrlMediumAddress"), img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
currency=property_data.get("currency", None), description=property_data.get("description"),
lot_area_value=property_data.get("lotAreaValue"),
lot_area_unit=property_data["lotAreaUnits"].lower()
if "lotAreaUnits" in property_data
else None,
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
stories=property_data.get("resoFacts", {}).get("stories", None),
description=property_data.get("description", None),
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
price_per_sqft=property_data.get("resoFacts", {}).get(
"pricePerSquareFoot", None
),
square_feet=property_data.get("livingArea", None),
property_type=PropertyType(property_type),
listing_type=self.listing_type,
) )
def _extract_address(self, address_str): def _extract_address(self, address_str):
@ -290,7 +273,7 @@ class ZillowScraper(Scraper):
if len(parts) != 3: if len(parts) != 3:
raise ValueError(f"Unexpected address format: {address_str}") raise ValueError(f"Unexpected address format: {address_str}")
street_address = parts[0].strip() address_one = parts[0].strip()
city = parts[1].strip() city = parts[1].strip()
state_zip = parts[2].split(" ") state_zip = parts[2].split(" ")
@ -303,14 +286,13 @@ class ZillowScraper(Scraper):
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
street_address, unit = parse_address_two(street_address) address_one, address_two = parse_address_one(address_one)
return Address( return Address(
street_address=street_address, address_one=address_one,
address_two=address_two if address_two else "#",
city=city, city=city,
unit=unit,
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
country="USA",
) )
@staticmethod @staticmethod

View File

@ -1,9 +1,9 @@
import re import re
def parse_address_one(street_address: str) -> tuple:
    """Split a street address into ``(main_address, unit)``.

    A trailing unit designator (``APT 126``, ``UNIT 3A``, ``LOT 7``,
    ``SUITE 3A`` or a bare ``#12``) is stripped from the street portion and
    normalized to a ``#``-prefixed form. When the input is falsy or carries
    no unit suffix, the unit defaults to ``"#"``.
    """
    if not street_address:
        return street_address, "#"

    unit_match = re.search(
        r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if not unit_match:
        return street_address, "#"

    raw_unit = unit_match.group().strip()
    # Normalize the textual designator (APT/UNIT/LOT/SUITE) to a leading "#".
    normalized_unit = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", raw_unit, flags=re.I)
    main_part = street_address.replace(raw_unit, "").strip()
    return main_part, normalized_unit
def parse_address_two(street_address: str):
    """Return the normalized unit portion of *street_address* (e.g. ``#126``).

    Accepts either a bare unit string (``"Apt 126"``) or a full address ending
    in a unit designator. Returns ``"#"`` when the input is falsy or no unit
    suffix can be identified.
    """
    if not street_address:
        return "#"

    match = re.search(
        r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if match is None:
        return "#"

    # Replace the textual designator (APT/UNIT/LOT/SUITE) with "#".
    return re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", match.group().strip(), flags=re.I)
if __name__ == "__main__":
print(parse_address_two("4303 E Cactus Rd Apt 126"))
print(parse_address_two("1234 Elm Street apt 2B"))
print(parse_address_two("1234 Elm Street UNIT 3A"))
print(parse_address_two("1234 Elm Street unit 3A"))
print(parse_address_two("1234 Elm Street SuIte 3A"))

View File

@ -9,15 +9,9 @@ from homeharvest.exceptions import (
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property( scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
), scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
scrape_property(
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
),
scrape_property(location="85281", site_name="redfin"), scrape_property(location="85281", site_name="redfin"),
] ]

24
tests/test_utils.py Normal file
View File

@ -0,0 +1,24 @@
from homeharvest.utils import parse_address_one, parse_address_two
def test_parse_address_one():
    """parse_address_one splits the street from the normalized unit suffix."""
    cases = [
        ("4303 E Cactus Rd Apt 126", ("4303 E Cactus Rd", "#126")),
        ("1234 Elm Street apt 2B", ("1234 Elm Street", "#2B")),
        ("1234 Elm Street UNIT 3A", ("1234 Elm Street", "#3A")),
        ("1234 Elm Street unit 3A", ("1234 Elm Street", "#3A")),
        ("1234 Elm Street SuIte 3A", ("1234 Elm Street", "#3A")),
    ]
    for raw_address, (expected_one, expected_two) in cases:
        parsed_one, parsed_two = parse_address_one(raw_address)
        assert parsed_one == expected_one
        assert parsed_two == expected_two
def test_parse_address_two():
    """parse_address_two normalizes bare unit strings to the '#'-prefixed form."""
    cases = [
        ("Apt 126", "#126"),
        ("apt 2B", "#2B"),
        ("UNIT 3A", "#3A"),
        ("unit 3A", "#3A"),
        ("SuIte 3A", "#3A"),
    ]
    for raw_unit, expected in cases:
        assert parse_address_two(raw_unit) == expected

View File

@ -9,15 +9,9 @@ from homeharvest.exceptions import (
def test_zillow(): def test_zillow():
results = [ results = [
scrape_property( scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
), scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
scrape_property(
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
] ]