- cullen merge
parent
088088ae51
commit
29664e4eee
|
@ -4,17 +4,14 @@ import concurrent.futures
|
|||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from .core.scrapers import ScraperInput
|
||||
from .core.scrapers.redfin import RedfinScraper
|
||||
from .utils import process_result, ordered_properties
|
||||
from .core.scrapers.realtor import RealtorScraper
|
||||
from .core.scrapers.zillow import ZillowScraper
|
||||
from .core.scrapers.models import ListingType, Property, SiteName
|
||||
from .exceptions import InvalidSite, InvalidListingType
|
||||
|
||||
|
||||
# Registry of supported sites: maps the site name (looked up lower-cased by
# the scraping helpers) to the scraper class that handles it.
_scrapers = {
    "redfin": RedfinScraper,
    "realtor.com": RealtorScraper,
    "zillow": ZillowScraper,
}
|
||||
|
||||
|
||||
|
@ -26,86 +23,6 @@ def _validate_input(site_name: str, listing_type: str) -> None:
|
|||
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
|
||||
|
||||
|
||||
def _get_ordered_properties(result: Property) -> list[str]:
|
||||
return [
|
||||
"property_url",
|
||||
"site_name",
|
||||
"listing_type",
|
||||
"property_type",
|
||||
"status_text",
|
||||
"baths_min",
|
||||
"baths_max",
|
||||
"beds_min",
|
||||
"beds_max",
|
||||
"sqft_min",
|
||||
"sqft_max",
|
||||
"price_min",
|
||||
"price_max",
|
||||
"unit_count",
|
||||
"tax_assessed_value",
|
||||
"price_per_sqft",
|
||||
"lot_area_value",
|
||||
"lot_area_unit",
|
||||
"address_one",
|
||||
"address_two",
|
||||
"city",
|
||||
"state",
|
||||
"zip_code",
|
||||
"posted_time",
|
||||
"area_min",
|
||||
"bldg_name",
|
||||
"stories",
|
||||
"year_built",
|
||||
"agent_name",
|
||||
"agent_phone",
|
||||
"agent_email",
|
||||
"days_on_market",
|
||||
"sold_date",
|
||||
"mls_id",
|
||||
"img_src",
|
||||
"latitude",
|
||||
"longitude",
|
||||
"description",
|
||||
]
|
||||
|
||||
|
||||
def _process_result(result: "Property") -> pd.DataFrame:
    """Flatten one scraped property into a single-row DataFrame.

    Enum fields are replaced by their values (``listing_type`` and
    ``property_type`` lower-cased), and the nested ``address`` and ``agent``
    objects are expanded into flat ``address_*``/``city``/``state``/``zip_code``
    and ``agent_*`` columns.

    :param result: the scraped property; it is left unmodified
    :returns: one-row DataFrame whose columns follow ``_get_ordered_properties``
    """
    # Work on a shallow copy: ``result.__dict__`` is the instance's live
    # attribute dict, so editing it in place would overwrite the enum fields on
    # ``result`` and delete its ``address``/``agent`` attributes, making a
    # second call on the same object fail.
    prop_data = dict(result.__dict__)

    prop_data["site_name"] = prop_data["site_name"].value
    prop_data["listing_type"] = prop_data["listing_type"].value.lower()

    property_type = prop_data.get("property_type")
    prop_data["property_type"] = (
        property_type.value.lower() if property_type is not None else None
    )

    # Expand the nested address; a missing address yields empty columns so the
    # final column selection below still succeeds.
    address_data = prop_data.pop("address", None)
    for field in ("address_one", "address_two", "city", "state", "zip_code"):
        prop_data[field] = (
            getattr(address_data, field) if address_data is not None else None
        )

    # Expand the nested agent the same way.
    agent_data = prop_data.pop("agent", None)
    if agent_data is not None:
        prop_data["agent_name"] = agent_data.name
        prop_data["agent_phone"] = agent_data.phone
        prop_data["agent_email"] = agent_data.email
    else:
        prop_data["agent_name"] = None
        prop_data["agent_phone"] = None
        prop_data["agent_email"] = None

    properties_df = pd.DataFrame([prop_data])
    return properties_df[_get_ordered_properties(result)]
|
||||
|
||||
|
||||
def _scrape_single_site(location: str, site_name: str, listing_type: str, radius: float, proxy: str = None, sold_last_x_days: int = None) -> pd.DataFrame:
|
||||
"""
|
||||
Helper function to scrape a single site.
|
||||
|
@ -124,22 +41,20 @@ def _scrape_single_site(location: str, site_name: str, listing_type: str, radius
|
|||
site = _scrapers[site_name.lower()](scraper_input)
|
||||
results = site.search()
|
||||
|
||||
properties_dfs = [_process_result(result) for result in results]
|
||||
properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
|
||||
properties_dfs = [process_result(result) for result in results]
|
||||
if not properties_dfs:
|
||||
return pd.DataFrame()
|
||||
|
||||
return pd.concat(properties_dfs, ignore_index=True)
|
||||
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
|
||||
|
||||
|
||||
def scrape_property(
|
||||
location: str,
|
||||
site_name: Union[str, list[str]] = "realtor.com",
|
||||
#: site_name: Union[str, list[str]] = "realtor.com",
|
||||
listing_type: str = "for_sale",
|
||||
radius: float = None,
|
||||
sold_last_x_days: int = None,
|
||||
proxy: str = None,
|
||||
keep_duplicates: bool = False
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Scrape property from various sites from a given location and listing type.
|
||||
|
@ -153,6 +68,7 @@ def scrape_property(
|
|||
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
||||
:returns: pd.DataFrame containing properties
|
||||
"""
|
||||
site_name = "realtor.com"
|
||||
|
||||
if site_name is None:
|
||||
site_name = list(_scrapers.keys())
|
||||
|
@ -183,13 +99,11 @@ def scrape_property(
|
|||
|
||||
final_df = pd.concat(results, ignore_index=True)
|
||||
|
||||
columns_to_track = ["address_one", "address_two", "city"]
|
||||
columns_to_track = ["Street", "Unit", "Zip"]
|
||||
|
||||
#: validate they exist, otherwise create them
|
||||
for col in columns_to_track:
|
||||
if col not in final_df.columns:
|
||||
final_df[col] = None
|
||||
|
||||
if not keep_duplicates:
|
||||
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
|
||||
return final_df
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Tuple
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class SiteName(Enum):
|
||||
|
@ -23,46 +22,13 @@ class ListingType(Enum):
|
|||
SOLD = "SOLD"
|
||||
|
||||
|
||||
class PropertyType(Enum):
    """Closed set of property categories used across the scrapers."""

    HOUSE = "HOUSE"
    BUILDING = "BUILDING"
    CONDO = "CONDO"
    TOWNHOUSE = "TOWNHOUSE"
    SINGLE_FAMILY = "SINGLE_FAMILY"
    MULTI_FAMILY = "MULTI_FAMILY"
    MANUFACTURED = "MANUFACTURED"
    NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
    APARTMENT = "APARTMENT"
    APARTMENTS = "APARTMENTS"
    LAND = "LAND"
    LOT = "LOT"
    OTHER = "OTHER"

    BLANK = "BLANK"

    @classmethod
    def from_int_code(cls, code):
        """Translate a numeric property-type code into a member.

        :param code: integer code as delivered by a site's API
        :returns: the matching member, or ``BLANK`` for any unmapped code
            (including ``None``)
        """
        # Kept local to the method: a dict assigned in the Enum class body
        # would itself be turned into an enum member.
        mapping = {
            1: cls.HOUSE,
            2: cls.CONDO,
            3: cls.TOWNHOUSE,
            4: cls.MULTI_FAMILY,
            5: cls.LAND,
            6: cls.OTHER,
            8: cls.SINGLE_FAMILY,
            13: cls.SINGLE_FAMILY,
        }

        return mapping.get(code, cls.BLANK)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Address:
|
||||
address_one: str | None = None
|
||||
address_two: str | None = "#"
|
||||
street: str | None = None
|
||||
unit: str | None = None
|
||||
city: str | None = None
|
||||
state: str | None = None
|
||||
zip_code: str | None = None
|
||||
zip: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -74,47 +40,31 @@ class Agent:
|
|||
|
||||
@dataclass
|
||||
class Property:
|
||||
property_url: str
|
||||
site_name: SiteName
|
||||
listing_type: ListingType
|
||||
address: Address
|
||||
property_type: PropertyType | None = None
|
||||
|
||||
# house for sale
|
||||
tax_assessed_value: int | None = None
|
||||
lot_area_value: float | None = None
|
||||
lot_area_unit: str | None = None
|
||||
stories: int | None = None
|
||||
year_built: int | None = None
|
||||
price_per_sqft: int | None = None
|
||||
property_url: str | None = None
|
||||
mls: str | None = None
|
||||
mls_id: str | None = None
|
||||
status: str | None = None
|
||||
style: str | None = None
|
||||
|
||||
agent: Agent | None = None
|
||||
img_src: str | None = None
|
||||
description: str | None = None
|
||||
status_text: str | None = None
|
||||
posted_time: datetime | None = None
|
||||
beds: int | None = None
|
||||
baths_full: int | None = None
|
||||
baths_half: int | None = None
|
||||
list_price: int | None = None
|
||||
list_date: str | None = None
|
||||
sold_price: int | None = None
|
||||
last_sold_date: str | None = None
|
||||
prc_sqft: float | None = None
|
||||
est_sf: int | None = None
|
||||
lot_sf: int | None = None
|
||||
hoa_fee: int | None = None
|
||||
|
||||
# building for sale
|
||||
bldg_name: str | None = None
|
||||
area_min: int | None = None
|
||||
|
||||
beds_min: int | None = None
|
||||
beds_max: int | None = None
|
||||
|
||||
baths_min: float | None = None
|
||||
baths_max: float | None = None
|
||||
|
||||
sqft_min: int | None = None
|
||||
sqft_max: int | None = None
|
||||
|
||||
price_min: int | None = None
|
||||
price_max: int | None = None
|
||||
|
||||
unit_count: int | None = None
|
||||
address: Address | None = None
|
||||
|
||||
yr_blt: int | None = None
|
||||
latitude: float | None = None
|
||||
longitude: float | None = None
|
||||
|
||||
sold_date: datetime | None = None
|
||||
days_on_market: int | None = None
|
||||
stories: int | None = None
|
||||
prkg_gar: float | None = None
|
||||
|
||||
neighborhoods: Optional[str] = None
|
||||
|
|
|
@ -7,7 +7,6 @@ This module implements the scraper for relator.com
|
|||
from ..models import Property, Address, ListingType
|
||||
from .. import Scraper
|
||||
from ....exceptions import NoResultsFound
|
||||
from ....utils import parse_address_one, parse_address_two
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
|
||||
|
@ -118,91 +117,105 @@ class RealtorScraper(Scraper):
|
|||
response_json = response.json()
|
||||
|
||||
property_info = response_json["data"]["property"]
|
||||
address_one, address_two = parse_address_one(property_info["address"]["line"])
|
||||
|
||||
return [
|
||||
Property(
|
||||
site_name=self.site_name,
|
||||
address=Address(
|
||||
address_one=address_one,
|
||||
address_two=address_two,
|
||||
city=property_info["address"]["city"],
|
||||
state=property_info["address"]["state_code"],
|
||||
zip_code=property_info["address"]["postal_code"],
|
||||
),
|
||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||
+ property_info["details"]["permalink"],
|
||||
+ property_info["details"]["permalink"],
|
||||
stories=property_info["details"]["stories"],
|
||||
year_built=property_info["details"]["year_built"],
|
||||
price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"]
|
||||
if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None
|
||||
else None,
|
||||
mls_id=property_id,
|
||||
listing_type=self.listing_type,
|
||||
lot_area_value=property_info["public_record"]["lot_size"]
|
||||
if property_info["public_record"] is not None
|
||||
else None,
|
||||
beds_min=property_info["basic"]["beds"],
|
||||
beds_max=property_info["basic"]["beds"],
|
||||
baths_min=property_info["basic"]["baths"],
|
||||
baths_max=property_info["basic"]["baths"],
|
||||
sqft_min=property_info["basic"]["sqft"],
|
||||
sqft_max=property_info["basic"]["sqft"],
|
||||
price_min=property_info["basic"]["price"],
|
||||
price_max=property_info["basic"]["price"],
|
||||
)
|
||||
]
|
||||
|
||||
def handle_area(self, variables: dict, is_for_comps: bool = False, return_total: bool = False) -> list[Property] | int:
|
||||
def handle_area(self, variables: dict, is_for_comps: bool = False, return_total: bool = False) -> list[
|
||||
Property] | int:
|
||||
"""
|
||||
Handles a location area & returns a list of properties
|
||||
"""
|
||||
|
||||
results_query = """{
|
||||
count
|
||||
total
|
||||
results {
|
||||
property_id
|
||||
description {
|
||||
baths
|
||||
beds
|
||||
lot_sqft
|
||||
sqft
|
||||
text
|
||||
sold_price
|
||||
stories
|
||||
year_built
|
||||
garage
|
||||
unit_number
|
||||
floor_number
|
||||
}
|
||||
location {
|
||||
address {
|
||||
city
|
||||
country
|
||||
line
|
||||
postal_code
|
||||
state_code
|
||||
state
|
||||
street_direction
|
||||
street_name
|
||||
street_number
|
||||
street_post_direction
|
||||
street_suffix
|
||||
unit
|
||||
coordinate {
|
||||
lon
|
||||
lat
|
||||
count
|
||||
total
|
||||
results {
|
||||
property_id
|
||||
list_date
|
||||
status
|
||||
last_sold_price
|
||||
last_sold_date
|
||||
hoa {
|
||||
fee
|
||||
}
|
||||
description {
|
||||
baths_full
|
||||
baths_half
|
||||
beds
|
||||
lot_sqft
|
||||
sqft
|
||||
sold_price
|
||||
year_built
|
||||
garage
|
||||
sold_price
|
||||
type
|
||||
sub_type
|
||||
name
|
||||
stories
|
||||
}
|
||||
source {
|
||||
raw {
|
||||
area
|
||||
status
|
||||
style
|
||||
}
|
||||
last_update_date
|
||||
contract_date
|
||||
id
|
||||
listing_id
|
||||
name
|
||||
type
|
||||
listing_href
|
||||
community_id
|
||||
management_id
|
||||
corporation_id
|
||||
subdivision_status
|
||||
spec_id
|
||||
plan_id
|
||||
tier_rank
|
||||
feed_type
|
||||
}
|
||||
location {
|
||||
address {
|
||||
city
|
||||
country
|
||||
line
|
||||
postal_code
|
||||
state_code
|
||||
state
|
||||
coordinate {
|
||||
lon
|
||||
lat
|
||||
}
|
||||
street_direction
|
||||
street_name
|
||||
street_number
|
||||
street_post_direction
|
||||
street_suffix
|
||||
unit
|
||||
}
|
||||
neighborhoods {
|
||||
name
|
||||
}
|
||||
}
|
||||
list_price
|
||||
price_per_sqft
|
||||
style_category_tags {
|
||||
exterior
|
||||
}
|
||||
source {
|
||||
id
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
list_price
|
||||
price_per_sqft
|
||||
source {
|
||||
id
|
||||
}
|
||||
}
|
||||
}}"""
|
||||
}"""
|
||||
|
||||
sold_date_param = ('sold_date: { min: "$today-%sD" }' % self.sold_last_x_days
|
||||
if self.listing_type == ListingType.SOLD and self.sold_last_x_days is not None
|
||||
|
@ -210,7 +223,7 @@ class RealtorScraper(Scraper):
|
|||
|
||||
if not is_for_comps:
|
||||
query = (
|
||||
"""query Home_search(
|
||||
"""query Home_search(
|
||||
$city: String,
|
||||
$county: [String],
|
||||
$state_code: String,
|
||||
|
@ -229,15 +242,15 @@ class RealtorScraper(Scraper):
|
|||
limit: 200
|
||||
offset: $offset
|
||||
) %s"""
|
||||
% (
|
||||
self.listing_type.value.lower(),
|
||||
sold_date_param,
|
||||
results_query
|
||||
)
|
||||
% (
|
||||
self.listing_type.value.lower(),
|
||||
sold_date_param,
|
||||
results_query
|
||||
)
|
||||
)
|
||||
else:
|
||||
query = (
|
||||
"""query Property_search(
|
||||
"""query Property_search(
|
||||
$coordinates: [Float]!
|
||||
$radius: String!
|
||||
$offset: Int!,
|
||||
|
@ -270,56 +283,80 @@ class RealtorScraper(Scraper):
|
|||
properties: list[Property] = []
|
||||
|
||||
if (
|
||||
response_json is None
|
||||
or "data" not in response_json
|
||||
or response_json["data"] is None
|
||||
or search_key not in response_json["data"]
|
||||
or response_json["data"][search_key] is None
|
||||
or "results" not in response_json["data"][search_key]
|
||||
response_json is None
|
||||
or "data" not in response_json
|
||||
or response_json["data"] is None
|
||||
or search_key not in response_json["data"]
|
||||
or response_json["data"][search_key] is None
|
||||
or "results" not in response_json["data"][search_key]
|
||||
):
|
||||
return []
|
||||
|
||||
for result in response_json["data"][search_key]["results"]:
|
||||
self.counter += 1
|
||||
address_one, _ = parse_address_one(result["location"]["address"]["line"])
|
||||
mls = (
|
||||
result["source"].get("id")
|
||||
if "source" in result and isinstance(result["source"], dict)
|
||||
else None
|
||||
)
|
||||
mls_id = (
|
||||
result["source"].get("listing_id")
|
||||
if "source" in result and isinstance(result["source"], dict)
|
||||
else None
|
||||
)
|
||||
|
||||
if not mls_id:
|
||||
continue
|
||||
# not type
|
||||
|
||||
neighborhoods_list = []
|
||||
neighborhoods = result["location"].get("neighborhoods", [])
|
||||
|
||||
if neighborhoods:
|
||||
for neighborhood in neighborhoods:
|
||||
name = neighborhood.get("name")
|
||||
if name:
|
||||
neighborhoods_list.append(name)
|
||||
|
||||
neighborhoods_str = (
|
||||
", ".join(neighborhoods_list) if neighborhoods_list else None
|
||||
)
|
||||
|
||||
able_to_get_lat_long = result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate")
|
||||
|
||||
realty_property = Property(
|
||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||
+ result["property_id"],
|
||||
mls=mls,
|
||||
mls_id=mls_id,
|
||||
status=result["status"].upper(),
|
||||
style=result["description"]["type"].upper(),
|
||||
beds=result["description"]["beds"],
|
||||
baths_full=result["description"]["baths_full"],
|
||||
baths_half=result["description"]["baths_half"],
|
||||
est_sf=result["description"]["sqft"],
|
||||
lot_sf=result["description"]["lot_sqft"],
|
||||
list_price=result["list_price"],
|
||||
list_date=result["list_date"].split("T")[0]
|
||||
if result["list_date"]
|
||||
else None,
|
||||
sold_price=result["description"]["sold_price"],
|
||||
prc_sqft=result["price_per_sqft"],
|
||||
last_sold_date=result["last_sold_date"],
|
||||
hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
|
||||
address=Address(
|
||||
address_one=address_one,
|
||||
street=f"{result['location']['address']['street_number']} {result['location']['address']['street_name']} {result['location']['address']['street_suffix']}",
|
||||
unit=result["location"]["address"]["unit"],
|
||||
city=result["location"]["address"]["city"],
|
||||
state=result["location"]["address"]["state_code"],
|
||||
zip_code=result["location"]["address"]["postal_code"],
|
||||
address_two=parse_address_two(result["location"]["address"]["unit"]),
|
||||
zip=result["location"]["address"]["postal_code"],
|
||||
),
|
||||
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||
if result
|
||||
and result.get("location")
|
||||
and result["location"].get("address")
|
||||
and result["location"]["address"].get("coordinate")
|
||||
and "lat" in result["location"]["address"]["coordinate"]
|
||||
else None,
|
||||
longitude=result["location"]["address"]["coordinate"]["lon"]
|
||||
if result
|
||||
and result.get("location")
|
||||
and result["location"].get("address")
|
||||
and result["location"]["address"].get("coordinate")
|
||||
and "lon" in result["location"]["address"]["coordinate"]
|
||||
else None,
|
||||
site_name=self.site_name,
|
||||
property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"],
|
||||
yr_blt=result["description"]["year_built"],
|
||||
latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
|
||||
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
||||
prkg_gar=result["description"]["garage"],
|
||||
stories=result["description"]["stories"],
|
||||
year_built=result["description"]["year_built"],
|
||||
price_per_sqft=result["price_per_sqft"],
|
||||
mls_id=result["property_id"],
|
||||
listing_type=self.listing_type,
|
||||
lot_area_value=result["description"]["lot_sqft"],
|
||||
beds_min=result["description"]["beds"],
|
||||
beds_max=result["description"]["beds"],
|
||||
baths_min=result["description"]["baths"],
|
||||
baths_max=result["description"]["baths"],
|
||||
sqft_min=result["description"]["sqft"],
|
||||
sqft_max=result["description"]["sqft"],
|
||||
price_min=result["list_price"],
|
||||
price_max=result["list_price"],
|
||||
neighborhoods=neighborhoods_str,
|
||||
)
|
||||
properties.append(realty_property)
|
||||
|
||||
|
|
|
@ -1,246 +0,0 @@
|
|||
"""
|
||||
homeharvest.redfin.__init__
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This module implements the scraper for redfin.com
|
||||
"""
|
||||
import json
|
||||
from typing import Any
|
||||
from .. import Scraper
|
||||
from ....utils import parse_address_two, parse_address_one
|
||||
from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent
|
||||
from ....exceptions import NoResultsFound, SearchTooBroad
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class RedfinScraper(Scraper):
    """Scraper for redfin.com: for-sale, sold and rental searches plus single-address lookups."""

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        self.listing_type = scraper_input.listing_type

    def _handle_location(self):
        """Resolve ``self.location`` through Redfin's location autocomplete.

        :returns: tuple ``(region_id, region_type)``; ``region_type`` is
            ``"2"`` (zip), ``"6"`` (city), ``"address"``, ``"state"``, or
            ``None`` for an unrecognised match type
        :raises NoResultsFound: when the autocomplete response has no usable match
        """
        # Imported here so the block does not depend on a file-level import.
        from urllib.parse import quote

        # Percent-encode the user-supplied location so characters such as
        # "&" or "#" cannot corrupt the query string.
        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
            quote(self.location)
        )

        response = self.session.get(url)
        # The endpoint prefixes its JSON body with "{}&&"; strip it before parsing.
        response_json = json.loads(response.text.replace("{}&&", ""))

        #: autocomplete match type -> region type used by the search endpoints
        region_types = {
            "4": "2",  #: zip
            "2": "6",  #: city
            "1": "address",  #: address, needs to be handled differently
            "11": "state",
        }

        payload = response_json["payload"]
        if "exactMatch" not in payload:
            raise NoResultsFound("No results found for location: {}".format(self.location))

        if payload["exactMatch"] is not None:
            target = payload["exactMatch"]
        else:
            try:
                #: fall back to the first suggestion
                target = payload["sections"][0]["rows"][0]
            except (KeyError, IndexError):
                raise NoResultsFound(
                    "No results found for location: {}".format(self.location)
                ) from None

        return target["id"].split("_")[1], region_types.get(target["type"])

    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
        """Convert one Redfin home payload into a ``Property``.

        :param home: home dict from a search result, or the
            ``addressSectionInfo`` payload of a single-address lookup
        :param single_search: True when ``home`` comes from a single-address
            lookup, whose address and year-built fields are shaped differently
        :returns: the populated ``Property``
        """

        def get_value(key: str) -> Any | None:
            #: many fields arrive wrapped as {"value": ...}; anything else yields None
            entry = home.get(key)
            if isinstance(entry, dict):
                return entry.get("value")
            return None

        if not single_search:
            street_line = get_value("streetLine")
        else:
            street_line = home.get("streetAddress").get("assembledAddress")

        address_one, address_two = parse_address_one(street_line)
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=home.get("city"),
            state=home.get("state"),
            zip_code=home.get("zip"),
        )

        url = "https://www.redfin.com{}".format(home["url"])

        #: lot size is either a bare int or wrapped as {"value": ...}
        lot_size_data = home.get("lotSize")
        if isinstance(lot_size_data, int):
            lot_size = lot_size_data
        elif isinstance(lot_size_data, dict):
            lot_size = lot_size_data.get("value", None)
        else:
            lot_size = None

        #: price per sqft is likewise either a bare int or a wrapped value
        raw_price_per_sqft = home.get("pricePerSqFt")
        if isinstance(raw_price_per_sqft, int):
            price_per_sqft = raw_price_per_sqft
        else:
            price_per_sqft = get_value("pricePerSqFt")

        lat_long = get_value("latLong")
        listing_agent = home.get("listingAgent") or {}

        return Property(
            site_name=self.site_name,
            listing_type=self.listing_type,
            address=address,
            property_url=url,
            beds_min=home.get("beds"),
            beds_max=home.get("beds"),
            baths_min=home.get("baths"),
            baths_max=home.get("baths"),
            price_min=get_value("price"),
            price_max=get_value("price"),
            sqft_min=get_value("sqFt"),
            sqft_max=get_value("sqFt"),
            stories=home.get("stories"),
            agent=Agent(  #: listingAgent, some have sellingAgent as well
                name=listing_agent.get("name"),
                phone=listing_agent.get("phone"),
            ),
            description=home.get("listingRemarks"),
            year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
            lot_area_value=lot_size,
            property_type=PropertyType.from_int_code(home.get("propertyType")),
            price_per_sqft=price_per_sqft,
            mls_id=get_value("mlsId"),
            latitude=lat_long.get("latitude") if lat_long else None,
            longitude=lat_long.get("longitude") if lat_long else None,
            #: soldDate is divided by 1000 before conversion (presumably epoch milliseconds)
            sold_date=datetime.fromtimestamp(home["soldDate"] / 1000) if "soldDate" in home else None,
            days_on_market=get_value("dom"),
        )

    def _handle_rentals(self, region_id, region_type):
        """Fetch and parse the rental listings of a region.

        :param region_id: region id from ``_handle_location``
        :param region_type: region type from ``_handle_location``
        :returns: list of ``Property`` objects
        :raises NoResultsFound: when the region has no rentals
        """
        url = (
            "https://www.redfin.com/stingray/api/v1/search/rentals"
            f"?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
        )

        response = self.session.get(url)
        response.raise_for_status()
        homes = response.json()

        properties_list = []

        for home in homes["homes"]:
            home_data = home["homeData"]
            rental_data = home["rentalExtension"]

            property_url = f"https://www.redfin.com{home_data.get('url', '')}"
            address_info = home_data.get("addressInfo", {})
            centroid = address_info.get("centroid", {}).get("centroid", {})
            address = Address(
                address_one=parse_address_one(address_info.get("formattedStreetLine"))[0],
                city=address_info.get("city"),
                state=address_info.get("state"),
                zip_code=address_info.get("zip"),
            )

            #: every range defaults to an empty min/max pair when absent
            price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
            bed_range = rental_data.get("bedRange", {"min": None, "max": None})
            bath_range = rental_data.get("bathRange", {"min": None, "max": None})
            sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})

            property_ = Property(
                property_url=property_url,
                site_name=SiteName.REDFIN,
                listing_type=ListingType.FOR_RENT,
                address=address,
                description=rental_data.get("description"),
                latitude=centroid.get("latitude"),
                longitude=centroid.get("longitude"),
                baths_min=bath_range.get("min"),
                baths_max=bath_range.get("max"),
                beds_min=bed_range.get("min"),
                beds_max=bed_range.get("max"),
                price_min=price_range.get("min"),
                price_max=price_range.get("max"),
                sqft_min=sqft_range.get("min"),
                sqft_max=sqft_range.get("max"),
                img_src=home_data.get("staticMapUrl"),
                posted_time=rental_data.get("lastUpdated"),
                bldg_name=rental_data.get("propertyName"),
            )

            properties_list.append(property_)

        if not properties_list:
            raise NoResultsFound("No rentals found for the given location.")

        return properties_list

    def _parse_building(self, building: dict) -> Property:
        """Convert one building payload from a search result into a ``Property``.

        :param building: building dict from the ``buildings`` section of a search response
        :returns: ``Property`` typed as ``BUILDING`` carrying the for-sale unit count
        """
        building_address = building["address"]
        street_address = " ".join(
            [
                building_address["streetNumber"],
                building_address["directionalPrefix"],
                building_address["streetName"],
                building_address["streetType"],
            ]
        )
        unit = " ".join(
            [
                building_address["unitType"],
                building_address["unitValue"],
            ]
        )
        return Property(
            site_name=self.site_name,
            property_type=PropertyType("BUILDING"),
            address=Address(
                address_one=parse_address_one(street_address)[0],
                city=building_address["city"],
                state=building_address["stateOrProvinceCode"],
                zip_code=building_address["postalCode"],
                address_two=parse_address_two(unit),
            ),
            property_url="https://www.redfin.com{}".format(building["url"]),
            listing_type=self.listing_type,
            unit_count=building.get("numUnitsForSale"),
        )

    def handle_address(self, home_id: str):
        """
        Look up a single home by its Redfin property id.

        EPs:
        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3

        :param home_id: Redfin property id
        :returns: one-element list holding the parsed ``Property``
        """
        url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
            home_id
        )

        response = self.session.get(url)
        response_json = json.loads(response.text.replace("{}&&", ""))

        parsed_home = self._parse_home(response_json["payload"]["addressSectionInfo"], single_search=True)
        return [parsed_home]

    def search(self):
        """Run the search for ``self.location`` and ``self.listing_type``.

        :returns: list of ``Property`` objects (empty when the response has no payload)
        :raises SearchTooBroad: when the location resolves to a whole state
        :raises NoResultsFound: when the location or its rentals cannot be found
        """
        region_id, region_type = self._handle_location()

        if region_type == "state":
            raise SearchTooBroad("State searches are not supported, please use a more specific location.")

        if region_type == "address":
            #: for address matches the region id is the home id
            return self.handle_address(region_id)

        if self.listing_type == ListingType.FOR_RENT:
            return self._handle_rentals(region_id, region_type)

        url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}"
        if self.listing_type != ListingType.FOR_SALE:
            #: sold listings are limited to the last 30 days
            url += "&sold_within_days=30"
        url += "&num_homes=100000"

        response = self.session.get(url)
        response_json = json.loads(response.text.replace("{}&&", ""))

        if "payload" not in response_json:
            return []

        payload = response_json["payload"]
        homes_list = payload.get("homes", [])
        buildings_list = payload.get("buildings", {}).values()

        return [self._parse_home(home) for home in homes_list] + [
            self._parse_building(building) for building in buildings_list
        ]
|
|
@ -1,335 +0,0 @@
|
|||
"""
|
||||
homeharvest.zillow.__init__
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This module implements the scraper for zillow.com
|
||||
"""
|
||||
import re
|
||||
import json
|
||||
|
||||
import tls_client
|
||||
|
||||
from .. import Scraper
|
||||
from requests.exceptions import HTTPError
|
||||
from ....utils import parse_address_one, parse_address_two
|
||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||
from ..models import Property, Address, ListingType, PropertyType, Agent
|
||||
import urllib.parse
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
class ZillowScraper(Scraper):
|
||||
def __init__(self, scraper_input):
    """Set up the HTTP session, validate the location and build the search URL.

    :param scraper_input: scraper configuration, forwarded to the base class
    :raises NoResultsFound: when Zillow's autocomplete returns no suggestion
        for the location
    """
    session = tls_client.Session(
        client_identifier="chrome112", random_tls_extension_order=True
    )

    super().__init__(scraper_input, session)

    # Browser-like request headers sent with every request on this session.
    self.session.headers.update({
        'authority': 'www.zillow.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    })

    # NOTE(review): self.location and self.listing_type are presumably set by
    # the base-class constructor above; confirm against Scraper.
    if not self.is_plausible_location(self.location):
        raise NoResultsFound("Invalid location input: {}".format(self.location))

    listing_type_to_url_path = {
        ListingType.FOR_SALE: "for_sale",
        ListingType.FOR_RENT: "for_rent",
        ListingType.SOLD: "recently_sold",
    }

    self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/"
|
||||
|
||||
def is_plausible_location(self, location: str) -> bool:
    """Check a location against Zillow's autocomplete endpoint.

    :param location: free-text location; percent-encoded before being sent
    :returns: True unless the endpoint answers with an empty ``results`` list
    """
    url = (
        "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
        "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
    ).format(urllib.parse.quote(location))

    resp = self.session.get(url)

    return resp.json()["results"] != []
|
||||
|
||||
def search(self):
    """
    Download self.url and convert the page into a list of properties.

    The JSON embedded in the page's __NEXT_DATA__ script decides the path:
    - "searchPageState": the bounds in the page's window.mapBounds script are
      passed to _fetch_properties_backend;
    - "gdpClientCache": the page describes one property, returned as a
      single-element list.

    :raises HTTPError: the page request did not return status 200
    :raises GeoCoordsNotFound: a search page without window.mapBounds
    :raises NoResultsFound: no __NEXT_DATA__ script, or neither key present
    """
    resp = self.session.get(self.url)
    if resp.status_code != 200:
        raise HTTPError(
            f"bad response status code: {resp.status_code}"
        )
    html = resp.text

    next_data_match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if next_data_match is None:
        raise NoResultsFound("No results were found for Zillow with the given Location.")

    page_props = json.loads(next_data_match.group(1))["props"]["pageProps"]

    if "searchPageState" in page_props:
        bounds_pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
        bounds_match = re.search(bounds_pattern, html)
        if bounds_match is None:
            raise GeoCoordsNotFound("Box bounds could not be located.")

        # Capture groups come out in west, east, south, north order.
        bounds = [float(value) for value in bounds_match.groups()]
        return self._fetch_properties_backend(bounds)

    if "gdpClientCache" in page_props:
        # The cache is itself a JSON string; its first key holds the property.
        client_cache = json.loads(page_props["gdpClientCache"])
        first_key = list(client_cache.keys())[0]
        single_property = self._get_single_property_page(client_cache[first_key]["property"])
        return [single_property]

    raise NoResultsFound("Specific property data not found in the response.")
|
||||
|
||||
def _fetch_properties_backend(self, coords):
    """
    Request map results for a bounding box from Zillow's search-page-state endpoint.

    :param coords: bounds indexed as [west, east, south, north]
    :return: whatever _parse_properties builds from the JSON response
    :raises HTTPError: the endpoint did not return status 200
    """
    endpoint = "https://www.zillow.com/async-create-search-page-state"

    # For-sale categories switched off by both the rental and the sold filters.
    sale_categories_off = {
        "isForSaleByAgent": {"value": False},
        "isForSaleByOwner": {"value": False},
        "isNewConstruction": {"value": False},
        "isComingSoon": {"value": False},
        "isAuction": {"value": False},
        "isForSaleForeclosure": {"value": False},
    }

    if self.listing_type == ListingType.FOR_RENT:
        filter_state = {
            "isForRent": {"value": True},
            **sale_categories_off,
            "isAllHomes": {"value": True},
        }
    elif self.listing_type == ListingType.FOR_SALE:
        filter_state = {
            # alternative sort value left disabled in the original: "globalrelevanceex"
            "sortSelection": {"value": "days"},
            "isAllHomes": {"value": True},
        }
    else:
        # Every other listing type falls back to the recently-sold filter.
        filter_state = {
            "isRecentlySold": {"value": True},
            **sale_categories_off,
            "isAllHomes": {"value": True},
        }

    map_bounds = {
        "west": coords[0],
        "east": coords[1],
        "south": coords[2],
        "north": coords[3],
    }
    search_query_state = {
        "pagination": {},
        "isMapVisible": True,
        "mapBounds": map_bounds,
        "filterState": filter_state,
        "isListVisible": True,
        "mapZoom": 11,
    }
    payload = {
        "searchQueryState": search_query_state,
        "wants": {"cat1": ["mapResults"]},
        "isDebugRequest": False,
    }

    resp = self.session.put(endpoint, json=payload)
    if resp.status_code != 200:
        raise HTTPError(
            f"bad response status code: {resp.status_code}"
        )
    return self._parse_properties(resp.json())
|
||||
|
||||
@staticmethod
|
||||
def parse_posted_time(time: str) -> datetime:
|
||||
int_time = int(time.split(" ")[0])
|
||||
|
||||
if "hour" in time:
|
||||
return datetime.now() - timedelta(hours=int_time)
|
||||
|
||||
if "day" in time:
|
||||
return datetime.now() - timedelta(days=int_time)
|
||||
|
||||
def _parse_properties(self, property_data: dict):
    """
    Convert the map results of a search response into Property objects.

    Results carrying "hdpData" are treated as individual homes; results
    flagged "isBuilding" are treated as multi-unit buildings; anything else
    is skipped.

    :param property_data: decoded JSON holding cat1 -> searchResults -> mapResults
    :return: list of Property objects in response order
    """

    def _int_or_none(value):
        # Missing or null numeric fields become None instead of raising.
        return None if value is None else int(value)

    map_results = property_data["cat1"]["searchResults"]["mapResults"]
    properties_list = []

    for result in map_results:
        if "hdpData" in result:
            home_info = result["hdpData"]["homeInfo"]
            # Optional sub-objects are read defensively, matching the
            # building branch below, so one sparse result cannot abort the run.
            lat_long = result.get("latLong") or {}
            variable_data = result.get("variableData") or {}

            price = home_info.get("price")
            bedrooms = _int_or_none(home_info.get("bedrooms"))
            bathrooms = home_info.get("bathrooms")
            living_area = home_info.get("livingArea")
            sqft = _int_or_none(living_area)
            lot_area_value = home_info.get("lotAreaValue")

            # Only "time on Zillow" entries describe when the listing was posted.
            posted_time = None
            if "text" in variable_data and variable_data.get("type") == "TIME_ON_INFO":
                posted_time = self.parse_posted_time(variable_data["text"])

            # A zero or missing living area would make the division meaningless.
            price_per_sqft = None
            if living_area and price is not None:
                price_per_sqft = int(price // living_area)

            address_data = {
                "address_one": parse_address_one(home_info.get("streetAddress"))[0],
                "address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
                "city": home_info.get("city"),
                "state": home_info.get("state"),
                "zip_code": home_info.get("zipcode"),
            }
            property_obj = Property(
                site_name=self.site_name,
                address=Address(**address_data),
                property_url=f"https://www.zillow.com{result['detailUrl']}",
                tax_assessed_value=_int_or_none(home_info.get("taxAssessedValue")),
                property_type=PropertyType(home_info.get("homeType")),
                listing_type=ListingType(
                    home_info["statusType"] if "statusType" in home_info else self.listing_type
                ),
                status_text=result.get("statusText"),
                posted_time=posted_time,
                price_min=price,
                price_max=price,
                beds_min=bedrooms,
                beds_max=bedrooms,
                baths_min=bathrooms,
                baths_max=bathrooms,
                sqft_min=sqft,
                sqft_max=sqft,
                price_per_sqft=price_per_sqft,
                latitude=lat_long.get("latitude"),
                longitude=lat_long.get("longitude"),
                lot_area_value=round(lot_area_value, 2) if lot_area_value is not None else None,
                lot_area_unit=home_info.get("lotAreaUnit"),
                img_src=result.get("imgSrc"),
            )

            properties_list.append(property_obj)

        elif "isBuilding" in result:
            raw_price = result["price"]
            price_string = raw_price.replace("$", "").replace(",", "").replace("+/mo", "")

            price_match = re.search(r"(\d+)", price_string)
            price_value = int(price_match.group(1)) if price_match else None
            # The price is only recorded when it is a "+/mo" starting rent.
            monthly_price = price_value if "+/mo" in raw_price else None
            lat_long = result.get("latLong", {})

            building_obj = Property(
                property_url=f"https://www.zillow.com{result['detailUrl']}",
                site_name=self.site_name,
                property_type=PropertyType("BUILDING"),
                listing_type=ListingType(result["statusType"]),
                img_src=result.get("imgSrc"),
                address=self._extract_address(result["address"]),
                baths_min=result.get("minBaths"),
                area_min=result.get("minArea"),
                bldg_name=result.get("communityName"),
                status_text=result.get("statusText"),
                price_min=monthly_price,
                price_max=monthly_price,
                latitude=lat_long.get("latitude"),
                longitude=lat_long.get("longitude"),
                unit_count=result.get("unitCount"),
            )

            properties_list.append(building_obj)

    return properties_list
|
||||
|
||||
def _get_single_property_page(self, property_data: dict):
    """
    Build a Property from a single-property page payload.

    This path is used when the user entered an exact location and Zillow
    returned just one property.

    :param property_data: the "property" object taken from the page's client cache
    :return: a Property populated from that object
    """
    hdp_url = property_data["hdpUrl"]
    # Relative detail URLs are made absolute; absolute ones are kept as-is.
    if "zillow.com" not in hdp_url:
        property_url = f"https://www.zillow.com{hdp_url}"
    else:
        property_url = hdp_url

    raw_address = property_data["address"]
    street, unit = parse_address_one(raw_address["streetAddress"])
    address = Address(
        address_one=street,
        address_two=unit if unit else "#",
        city=raw_address["city"],
        state=raw_address["state"],
        zip_code=raw_address["zipcode"],
    )

    home_type = property_data.get("homeType", None)
    # Home types that are not member names of PropertyType are recorded as None.
    property_type = PropertyType(home_type) if home_type in PropertyType.__members__ else None

    lot_area_unit = None
    if "lotAreaUnits" in property_data:
        lot_area_unit = property_data["lotAreaUnits"].lower()

    attribution = property_data.get("attributionInfo", {})
    reso_facts = property_data.get("resoFacts", {})
    bedrooms = property_data.get("bedrooms")
    bathrooms = property_data.get("bathrooms")
    price = property_data.get("price")
    living_area = property_data.get("livingArea")

    return Property(
        site_name=self.site_name,
        property_url=property_url,
        property_type=property_type,
        listing_type=self.listing_type,
        address=address,
        year_built=property_data.get("yearBuilt"),
        tax_assessed_value=property_data.get("taxAssessedValue"),
        lot_area_value=property_data.get("lotAreaValue"),
        lot_area_unit=lot_area_unit,
        agent=Agent(name=attribution.get("agentName")),
        stories=reso_facts.get("stories"),
        mls_id=attribution.get("mlsId"),
        beds_min=bedrooms,
        beds_max=bedrooms,
        baths_min=bathrooms,
        baths_max=bathrooms,
        price_min=price,
        price_max=price,
        sqft_min=living_area,
        sqft_max=living_area,
        price_per_sqft=reso_facts.get("pricePerSquareFoot"),
        latitude=property_data.get("latitude"),
        longitude=property_data.get("longitude"),
        img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
        description=property_data.get("description"),
    )
|
||||
|
||||
def _extract_address(self, address_str):
    """
    Split a string such as '555 Wedglea Dr, Dallas, TX' into an Address object.

    The string must contain exactly three ", "-separated parts; the last part
    is either a state alone or a state followed by a zip code.

    :raises ValueError: the string does not have that shape
    """
    segments = address_str.split(", ")
    if len(segments) != 3:
        raise ValueError(f"Unexpected address format: {address_str}")
    street, city, region = segments

    region_tokens = region.split(" ")
    if len(region_tokens) not in (1, 2):
        raise ValueError(f"Unexpected state/zip format in address: {address_str}")

    state = region_tokens[0].strip()
    zip_code = region_tokens[1].strip() if len(region_tokens) == 2 else None

    address_one, address_two = parse_address_one(street.strip())
    return Address(
        address_one=address_one,
        address_two=address_two if address_two else "#",
        city=city.strip(),
        state=state,
        zip_code=zip_code,
    )
|
|
@ -1,38 +1,76 @@
|
|||
import re
|
||||
from .core.scrapers.models import Property
|
||||
import pandas as pd
|
||||
|
||||
# Output column names, in the order they appear in the result DataFrame.
ordered_properties = [
    # listing identity
    "PropertyURL", "MLS", "MLS #", "Status", "Style",
    # address
    "Street", "Unit", "City", "State", "Zip",
    # size and age
    "Beds", "FB", "NumHB", "EstSF", "YrBlt",
    # pricing and dates
    "ListPrice", "Lst Date", "Sold Price", "COEDate",
    "LotSFApx", "PrcSqft",
    # position
    "LATITUDE", "LONGITUDE",
    # extras
    "Stories", "HOAFee", "PrkgGar", "Community",
]
|
||||
|
||||
|
||||
# Trailing unit designator: "Apt 126", "#2B", "UNIT 3A", "Lot 7", "Suite 10".
# The leading \b stops a street name that merely ends in those letters
# (e.g. "Charlotte") from being read as a unit.
_UNIT_SUFFIX_RE = re.compile(r"(\b(?:APT|UNIT|LOT|SUITE)\s*[\dA-Z]+|#[\dA-Z]+)$", re.I)

# Leading keyword of an already-isolated unit designator.
_UNIT_KEYWORD_RE = re.compile(r"^(?:APT|UNIT|LOT|SUITE)\s*", re.I)

# Output column -> name of the key it is copied from in process_result.
_COLUMN_SOURCES = {
    "PropertyURL": "property_url",
    "MLS": "mls",
    "MLS #": "mls_id",
    "Status": "status",
    "Style": "style",
    "Community": "neighborhoods",
    "Beds": "beds",
    "FB": "baths_full",
    "NumHB": "baths_half",
    "EstSF": "est_sf",
    "ListPrice": "list_price",
    "Lst Date": "list_date",
    "Sold Price": "sold_price",
    "COEDate": "last_sold_date",
    "LotSFApx": "lot_sf",
    "HOAFee": "hoa_fee",
    "YrBlt": "yr_blt",
    "LATITUDE": "latitude",
    "LONGITUDE": "longitude",
    "Stories": "stories",
    "PrkgGar": "prkg_gar",
}


def parse_address_one(street_address: str) -> tuple:
    """
    Split a street address into (main address, unit).

    The unit is normalised to "#<id>"; "#" alone means no unit was found.
    An empty or None input is returned unchanged alongside "#".
    """
    if not street_address:
        return street_address, "#"

    unit_match = _UNIT_SUFFIX_RE.search(street_address)
    if unit_match is None:
        return street_address, "#"

    unit = _UNIT_KEYWORD_RE.sub("#", unit_match.group().strip(), count=1)
    # Cut at the match position rather than str.replace(), which would also
    # strip an identical substring occurring earlier in the address.
    main_address = street_address[: unit_match.start()].strip()
    return main_address, unit


def parse_address_two(street_address: str):
    """
    Normalise a unit designator such as "Apt 126" to "#126".

    Returns "#" when the input is empty or contains no trailing unit designator.
    """
    if not street_address:
        return "#"

    unit_match = _UNIT_SUFFIX_RE.search(street_address)
    if unit_match is None:
        return "#"

    return _UNIT_KEYWORD_RE.sub("#", unit_match.group().strip(), count=1)


def process_result(result: "Property") -> pd.DataFrame:
    """
    Flatten one Property into a single-row DataFrame with the ordered_properties columns.

    :param result: object whose __dict__ supplies the source values
    :return: one-row DataFrame whose columns are exactly ordered_properties, in order
    """
    # Start from every output column set to None, then overlay the raw
    # attributes; the copy leaves result itself untouched.
    prop_data = {prop: None for prop in ordered_properties}
    prop_data.update(result.__dict__)

    for column, source in _COLUMN_SOURCES.items():
        prop_data[column] = prop_data[source]

    address_data = prop_data.get("address")
    if address_data is not None:
        prop_data["Street"] = address_data.street
        prop_data["Unit"] = address_data.unit
        prop_data["City"] = address_data.city
        prop_data["State"] = address_data.state
        prop_data["Zip"] = address_data.zip

    if prop_data.get("prc_sqft") is not None:
        prop_data["PrcSqft"] = round(prop_data["prc_sqft"], 2)

    # reindex drops the raw attribute keys and fixes the column order.
    properties_df = pd.DataFrame([prop_data])
    return properties_df.reindex(columns=ordered_properties)
|
|
@ -10,7 +10,6 @@ from homeharvest.exceptions import (
|
|||
def test_realtor_comps():
|
||||
result = scrape_property(
|
||||
location="2530 Al Lipscomb Way",
|
||||
site_name="realtor.com",
|
||||
radius=0.5,
|
||||
)
|
||||
|
||||
|
@ -19,11 +18,11 @@ def test_realtor_comps():
|
|||
|
||||
def test_realtor_last_x_days_sold():
    """Sold listings for 30-day and 10-day windows must both exist and differ in size."""
    # Each call passes `location` exactly once; a second `location=` line
    # would be a repeated keyword argument and a syntax error.
    days_result_30 = scrape_property(
        location="Dallas, TX", listing_type="sold", sold_last_x_days=30
    )
    days_result_10 = scrape_property(
        location="Dallas, TX", listing_type="sold", sold_last_x_days=10
    )

    assert all(result is not None for result in (days_result_30, days_result_10))
    assert len(days_result_30) != len(days_result_10)
|
||||
|
@ -33,16 +32,15 @@ def test_realtor():
|
|||
results = [
|
||||
scrape_property(
|
||||
location="2530 Al Lipscomb Way",
|
||||
site_name="realtor.com",
|
||||
listing_type="for_sale",
|
||||
),
|
||||
scrape_property(
|
||||
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
|
||||
location="Phoenix, AZ", listing_type="for_rent"
|
||||
), #: does not support "city, state, USA" format
|
||||
scrape_property(
|
||||
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
||||
location="Dallas, TX", listing_type="sold"
|
||||
), #: does not support "city, state, USA" format
|
||||
scrape_property(location="85281", site_name="realtor.com"),
|
||||
scrape_property(location="85281"),
|
||||
]
|
||||
|
||||
assert all([result is not None for result in results])
|
||||
|
@ -52,7 +50,6 @@ def test_realtor():
|
|||
bad_results += [
|
||||
scrape_property(
|
||||
location="abceefg ju098ot498hh9",
|
||||
site_name="realtor.com",
|
||||
listing_type="for_sale",
|
||||
)
|
||||
]
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import (
|
||||
InvalidSite,
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
GeoCoordsNotFound,
|
||||
SearchTooBroad,
|
||||
)
|
||||
|
||||
|
||||
def test_redfin():
    """Valid Redfin searches return a result; invalid ones raise or return None."""
    valid_queries = [
        {"location": "San Diego", "site_name": "redfin", "listing_type": "for_sale"},
        {"location": "2530 Al Lipscomb Way", "site_name": "redfin", "listing_type": "for_sale"},
        {"location": "Phoenix, AZ, USA", "site_name": ["redfin"], "listing_type": "for_rent"},
        {"location": "Dallas, TX, USA", "site_name": "redfin", "listing_type": "sold"},
        {"location": "85281", "site_name": "redfin"},
    ]
    results = [scrape_property(**query) for query in valid_queries]

    assert all(result is not None for result in results)

    bad_results = []
    try:
        # Both searches run before anything is recorded, so an exception
        # from either leaves bad_results empty.
        nonsense_result = scrape_property(
            location="abceefg ju098ot498hh9",
            site_name="redfin",
            listing_type="for_sale",
        )
        too_broad_result = scrape_property(location="Florida", site_name="redfin", listing_type="for_rent")
        bad_results.extend([nonsense_result, too_broad_result])
    except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad):
        pass

    assert all(result is None for result in bad_results)
|
|
@ -1,24 +0,0 @@
|
|||
from homeharvest.utils import parse_address_one, parse_address_two
|
||||
|
||||
|
||||
def test_parse_address_one():
    """parse_address_one splits a trailing unit off the street, ignoring keyword case."""
    expectations = {
        "4303 E Cactus Rd Apt 126": ("4303 E Cactus Rd", "#126"),
        "1234 Elm Street apt 2B": ("1234 Elm Street", "#2B"),
        "1234 Elm Street UNIT 3A": ("1234 Elm Street", "#3A"),
        "1234 Elm Street unit 3A": ("1234 Elm Street", "#3A"),
        "1234 Elm Street SuIte 3A": ("1234 Elm Street", "#3A"),
    }

    for raw_address, expected in expectations.items():
        street, unit = parse_address_one(raw_address)
        assert (street, unit) == expected
|
||||
|
||||
|
||||
def test_parse_address_two():
    """parse_address_two rewrites the unit keyword as '#', ignoring keyword case."""
    expectations = {
        "Apt 126": "#126",
        "apt 2B": "#2B",
        "UNIT 3A": "#3A",
        "unit 3A": "#3A",
        "SuIte 3A": "#3A",
    }

    for raw_unit, expected in expectations.items():
        assert parse_address_two(raw_unit) == expected
|
|
@ -1,34 +0,0 @@
|
|||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import (
|
||||
InvalidSite,
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
GeoCoordsNotFound,
|
||||
)
|
||||
|
||||
|
||||
def test_zillow():
    """Valid Zillow searches return a result; an invalid one raises or returns None."""
    valid_queries = [
        {"location": "2530 Al Lipscomb Way", "site_name": "zillow", "listing_type": "for_sale"},
        {"location": "Phoenix, AZ, USA", "site_name": ["zillow"], "listing_type": "for_rent"},
        {"location": "Surprise, AZ", "site_name": ["zillow"], "listing_type": "for_sale"},
        {"location": "Dallas, TX, USA", "site_name": "zillow", "listing_type": "sold"},
        {"location": "85281", "site_name": "zillow"},
        {"location": "3268 88th st s, Lakewood", "site_name": "zillow", "listing_type": "for_rent"},
    ]
    results = [scrape_property(**query) for query in valid_queries]

    assert all(result is not None for result in results)

    bad_results = []
    try:
        nonsense_result = scrape_property(
            location="abceefg ju098ot498hh9",
            site_name="zillow",
            listing_type="for_sale",
        )
        bad_results.append(nonsense_result)
    except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
        pass

    assert all(result is None for result in bad_results)
|
Loading…
Reference in New Issue