- various data quality fixes (including #70)

pull/82/head
Zachary Hampton 2024-05-02 08:48:53 -07:00
parent 04ae968716
commit 46985dcee4
5 changed files with 88 additions and 29 deletions

View File

@ -13,9 +13,10 @@ def scrape_property(
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int = None,
proxy: str = None, proxy: str = None,
date_from: str = None, date_from: str = None, #: TODO: Switch to a single Date parameter (holding date_from and date_to) with pydantic validation
date_to: str = None, date_to: str = None,
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@ -23,9 +24,11 @@ def scrape_property(
:param listing_type: Listing Type (for_sale, for_rent, sold) :param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28 :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping :param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: If set, fetches additional property data (e.g. agent, broker, property evaluations). Note: this increases the number of requests by O(n).
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
@ -51,4 +54,5 @@ def scrape_property(
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""})

View File

@ -76,10 +76,27 @@ class Description:
text: str | None = None text: str | None = None
@dataclass
class AgentPhone: #: For documentation purposes only (at the moment)
number: str | None = None
type: str | None = None
primary: bool | None = None
ext: str | None = None
@dataclass @dataclass
class Agent: class Agent:
name: str | None = None
phones: list[dict] | AgentPhone | None = None
email: str | None = None
href: str | None = None
@dataclass
class Broker:
name: str | None = None name: str | None = None
phone: str | None = None phone: str | None = None
website: str | None = None
@dataclass @dataclass

View File

@ -651,26 +651,64 @@ class RealtorScraper(Scraper):
return homes return homes
def get_prop_details(self, property_id: str) -> dict: def get_prop_details(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }} taxHistory: tax_history {{ __typename tax year assessment {{ __typename building land total }} }}estimates {{ __typename currentValues: current_values {{ __typename source {{ __typename type name }} estimate estimateHigh: estimate_high estimateLow: estimate_low date isBestHomeValue: isbest_homevalue }} }} }}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}' query = """query GetHome($property_id: ID!) {
response = self.session.post(self.PROPERTY_GQL, data=payload) home(property_id: $property_id) {
__typename
advertisers {
__typename
type
name
email
phones { number type ext primary }
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
}
}"""
variables = {"property_id": property_id}
response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables})
data = response.json()
def get_key(keys: list): def get_key(keys: list):
try: try:
data = response.json() value = data
for key in keys: for key in keys:
data = data[key] value = value[key]
return data
except (KeyError, TypeError): return value or {}
except (KeyError, TypeError, IndexError):
return {} return {}
ads = get_key(["data", "home", "consumerAdvertisers"]) ads = get_key(["data", "home", "advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"]) schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"]) assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"]) estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads] agents = [Agent(
name=ad["name"],
email=ad["email"],
phones=ad["phones"]
) for ad in ads]
schools = [school["district"]["name"] for school in schools] schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return { return {
"agents": agents if agents else None, "agents": agents if agents else None,
"schools": schools if schools else None, "schools": schools if schools else None,
@ -698,7 +736,8 @@ class RealtorScraper(Scraper):
return address_part return address_part
def _parse_address(self, result: dict, search_type): @staticmethod
def _parse_address(result: dict, search_type):
if search_type == "general_search": if search_type == "general_search":
address = result["location"]["address"] address = result["location"]["address"]
else: else:
@ -706,12 +745,12 @@ class RealtorScraper(Scraper):
return Address( return Address(
street=" ".join( street=" ".join(
[ part for part in [
self.handle_none_safely(address.get("street_number")), address.get("street_number"),
self.handle_none_safely(address.get("street_direction")), address.get("street_direction"),
self.handle_none_safely(address.get("street_name")), address.get("street_name"),
self.handle_none_safely(address.get("street_suffix")), address.get("street_suffix"),
] ] if part is not None
).strip(), ).strip(),
unit=address["unit"], unit=address["unit"],
city=address["city"], city=address["city"],
@ -746,7 +785,7 @@ class RealtorScraper(Scraper):
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),
sqft=description_data.get("sqft"), sqft=description_data.get("sqft"),
lot_sqft=description_data.get("lot_sqft"), lot_sqft=description_data.get("lot_sqft"),
sold_price=description_data.get("sold_price"), sold_price=description_data.get("sold_price") if result.get('last_sold_date') or result["list_price"] != description_data.get("sold_price") else None, #: keep sold_price only when there is a last sold date, or when the list price and sold price differ
year_built=description_data.get("year_built"), year_built=description_data.get("year_built"),
garage=description_data.get("garage"), garage=description_data.get("garage"),
stories=description_data.get("stories"), stories=description_data.get("stories"),

View File

@ -1,6 +1,6 @@
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType from .core.scrapers.models import Property, ListingType, Agent
from .exceptions import InvalidListingType, InvalidDate from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
@ -38,8 +38,8 @@ ordered_properties = [
"hoa_fee", "hoa_fee",
"parking_garage", "parking_garage",
"agent", "agent",
"broker", "agent_email",
"broker_phone", "agent_phones",
"nearby_schools", "nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
@ -59,12 +59,11 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["zip_code"] = address_data.zip prop_data["zip_code"] = address_data.zip
if "agents" in prop_data: if "agents" in prop_data:
agents = prop_data["agents"] agents: list[Agent] | None = prop_data["agents"]
if agents: if agents:
prop_data["agent"] = agents[0].name prop_data["agent"] = agents[0].name
if len(agents) > 1: prop_data["agent_email"] = agents[0].email
prop_data["broker"] = agents[1].name prop_data["agent_phones"] = agents[0].phones
prop_data["broker_phone"] = agents[1].phone
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
@ -107,5 +106,5 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
if date_to_obj < date_from_obj: if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.") raise InvalidDate("date_to must be after date_from.")
except ValueError as e: except ValueError:
raise InvalidDate(f"Invalid date format or range") raise InvalidDate(f"Invalid date format or range")

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.20" version = "0.3.21"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"