Compare commits

..

2 Commits

Author SHA1 Message Date
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
Zachary Hampton
3458a08383 - broker data 2024-05-11 21:35:29 -07:00
5 changed files with 53 additions and 6 deletions

View File

@@ -52,6 +52,7 @@ class PropertyType(Enum):
@dataclass
class Address:
full_line: str | None = None
street: str | None = None
unit: str | None = None
city: str | None = None
@@ -121,7 +122,8 @@ class Property:
neighborhoods: Optional[str] = None
county: Optional[str] = None
fips_code: Optional[str] = None
agents: list[Agent] = None
agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None

View File

@@ -10,7 +10,7 @@ from datetime import datetime
from typing import Dict, Union, Optional
from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent
from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
class RealtorScraper(Scraper):
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id
}
address {
line
street_direction
street_number
street_name
@@ -180,6 +181,7 @@ class RealtorScraper(Scraper):
),
days_on_mls=days_on_mls,
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -235,6 +237,7 @@ class RealtorScraper(Scraper):
stories
}
address {
line
street_direction
street_number
street_name
@@ -295,6 +298,7 @@ class RealtorScraper(Scraper):
address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -350,6 +354,7 @@ class RealtorScraper(Scraper):
street_number
street_name
street_suffix
line
unit
city
state_code
@@ -553,6 +558,7 @@ class RealtorScraper(Scraper):
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -654,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data:
return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
@@ -665,7 +673,13 @@ class RealtorScraper(Scraper):
email
phones { number type ext primary }
}
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
@@ -700,7 +714,9 @@ class RealtorScraper(Scraper):
except (KeyError, TypeError, IndexError):
return {}
ads = get_key(["data", "home", "advertisers"])
agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
@@ -709,11 +725,18 @@ class RealtorScraper(Scraper):
name=ad["name"],
email=ad["email"],
phones=ad["phones"]
) for ad in ads]
) for ad in agents]
brokers = [Broker(
name=ad["name"],
phone=ad["phone"],
website=ad["href"]
) for ad in advertisers if ad.get("type") != "Agent"]
schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return {
"agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None,
@@ -747,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"]
return Address(
full_line=address.get("line"),
street=" ".join(
part for part in [
address.get("street_number"),

View File

@@ -10,6 +10,7 @@ ordered_properties = [
"status",
"text",
"style",
"full_street_line",
"street",
"unit",
"city",
@@ -40,6 +41,9 @@ ordered_properties = [
"agent",
"agent_email",
"agent_phones",
"broker",
"broker_phone",
"broker_website",
"nearby_schools",
"primary_photo",
"alt_photos",
@@ -52,6 +56,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city
@@ -65,6 +70,13 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["agent_email"] = agents[0].email
prop_data["agent_phones"] = agents[0].phones
if "brokers" in prop_data:
brokers = prop_data["brokers"]
if brokers:
prop_data["broker"] = brokers[0].name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.21"
version = "0.3.23"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@@ -156,3 +156,12 @@ def test_realtor_without_extra_details():
]
assert results[0] != results[1]
def test_pr_zip_code():
    """Check that a for-sale search on ZIP code "00741" yields at least one result.

    NOTE(review): the test name suggests "00741" is a Puerto Rico ZIP code --
    confirm against the intended coverage.
    """
    listings = scrape_property(location="00741", listing_type="for_sale")

    # The scrape must return a non-None, non-empty collection.
    assert listings is not None and len(listings) > 0