Compare commits

...

3 Commits

Author SHA1 Message Date
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
Zachary Hampton
3458a08383 - broker data 2024-05-11 21:35:29 -07:00
6 changed files with 58 additions and 8 deletions

View File

@@ -53,7 +53,9 @@ def scrape_property(
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
properties_dfs = [df for df in properties_dfs if not df.empty]
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""}) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -36,6 +36,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME" CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP" CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO" CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS" CONDOS = "CONDOS"
COOP = "COOP" COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX" DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +53,7 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
full_line: str | None = None
street: str | None = None street: str | None = None
unit: str | None = None unit: str | None = None
city: str | None = None city: str | None = None
@@ -121,7 +123,8 @@ class Property:
neighborhoods: Optional[str] = None neighborhoods: Optional[str] = None
county: Optional[str] = None county: Optional[str] = None
fips_code: Optional[str] = None fips_code: Optional[str] = None
agents: list[Agent] = None agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None assessed_value: int | None = None
estimated_value: int | None = None estimated_value: int | None = None

View File

@@ -10,7 +10,7 @@ from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -180,6 +181,7 @@ class RealtorScraper(Scraper):
), ),
days_on_mls=days_on_mls, days_on_mls=days_on_mls,
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -235,6 +237,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -295,6 +298,7 @@ class RealtorScraper(Scraper):
address=self._parse_address(property_info, search_type="handle_address"), address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info), description=self._parse_description(property_info),
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -350,6 +354,7 @@ class RealtorScraper(Scraper):
street_number street_number
street_name street_name
street_suffix street_suffix
line
unit unit
city city
state_code state_code
@@ -553,6 +558,7 @@ class RealtorScraper(Scraper):
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -654,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data: if not self.extra_property_data:
return {} return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) { query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) { home(property_id: $property_id) {
__typename __typename
@@ -666,6 +674,12 @@ class RealtorScraper(Scraper):
phones { number type ext primary } phones { number type ext primary }
} }
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
@@ -700,7 +714,9 @@ class RealtorScraper(Scraper):
except (KeyError, TypeError, IndexError): except (KeyError, TypeError, IndexError):
return {} return {}
ads = get_key(["data", "home", "advertisers"]) agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"]) schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"]) assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"]) estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
@@ -709,11 +725,18 @@ class RealtorScraper(Scraper):
name=ad["name"], name=ad["name"],
email=ad["email"], email=ad["email"],
phones=ad["phones"] phones=ad["phones"]
) for ad in ads] ) for ad in agents]
brokers = [Broker(
name=ad["name"],
phone=ad["phone"],
website=ad["href"]
) for ad in advertisers if ad.get("type") != "Agent"]
schools = [school["district"]["name"] for school in schools if school['district'].get('name')] schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return { return {
"agents": agents if agents else None, "agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None, "schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None, "assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None, "estimated_value": estimated_value if estimated_value else None,
@@ -747,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"] address = result["address"]
return Address( return Address(
full_line=address.get("line"),
street=" ".join( street=" ".join(
part for part in [ part for part in [
address.get("street_number"), address.get("street_number"),
@@ -782,7 +806,7 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=PropertyType(style) if style else None, style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@@ -10,6 +10,7 @@ ordered_properties = [
"status", "status",
"text", "text",
"style", "style",
"full_street_line",
"street", "street",
"unit", "unit",
"city", "city",
@@ -40,6 +41,9 @@ ordered_properties = [
"agent", "agent",
"agent_email", "agent_email",
"agent_phones", "agent_phones",
"broker",
"broker_phone",
"broker_website",
"nearby_schools", "nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
@@ -52,6 +56,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
@@ -65,6 +70,13 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["agent_email"] = agents[0].email prop_data["agent_email"] = agents[0].email
prop_data["agent_phones"] = agents[0].phones prop_data["agent_phones"] = agents[0].phones
if "brokers" in prop_data:
brokers = prop_data["brokers"]
if brokers:
prop_data["broker"] = brokers[0].name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.21" version = "0.3.24"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@@ -156,3 +156,12 @@ def test_realtor_without_extra_details():
] ]
assert results[0] != results[1] assert results[0] != results[1]
def test_pr_zip_code():
    """Check that a for-sale search on ZIP code "00741" returns at least one row.

    NOTE(review): "pr" in the test name presumably refers to Puerto Rico
    ZIP codes — confirm. This calls the live scraper, so it needs network
    access and depends on current listings existing for that ZIP code.
    """
    listings = scrape_property(location="00741", listing_type="for_sale")

    # A missing result or an empty result set both count as failure.
    assert listings is not None and len(listings) > 0