enh: assessed/estimated value (#77)

pull/82/head v0.3.20
Cullen Watson 2024-04-30 15:29:54 -05:00 committed by GitHub
parent c5b15e9be5
commit 04ae968716
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 56 additions and 29 deletions

View File

@@ -1,6 +1,7 @@
import uuid
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid import uuid
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
@@ -19,24 +20,30 @@ class ScraperInput:
class Scraper: class Scraper:
session = None
def __init__( def __init__(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
session: requests.Session = None,
): ):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
if not session: if not self.session:
self.session = requests.Session() Scraper.session = requests.Session()
self.session.headers.update( retries = Retry(
total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{ {
"auth": f"Bearer {self.get_access_token()}", "auth": f"Bearer {self.get_access_token()}",
"apollographql-client-name": "com.move.Realtor-apollo-ios", "apollographql-client-name": "com.move.Realtor-apollo-ios",
} }
) )
else:
self.session = session
if scraper_input.proxy: if scraper_input.proxy:
proxy_url = scraper_input.proxy proxy_url = scraper_input.proxy
@@ -73,4 +80,8 @@ class Scraper:
response = requests.post(url, headers=headers, data=payload) response = requests.post(url, headers=headers, data=payload)
data = response.json() data = response.json()
return data["access_token"] try:
access_token = data["access_token"]
except Exception:
raise Exception("Could not get access token, use a proxy/vpn or wait")
return access_token

View File

@@ -106,3 +106,5 @@ class Property:
fips_code: Optional[str] = None fips_code: Optional[str] = None
agents: list[Agent] = None agents: list[Agent] = None
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None

View File

@@ -5,9 +5,9 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent from ..models import Property, Address, ListingType, Description, PropertyType, Agent
@@ -142,7 +142,7 @@ class RealtorScraper(Scraper):
days_on_mls = None days_on_mls = None
property_id = property_info["details"]["permalink"] property_id = property_info["details"]["permalink"]
agents_schools = self.get_agents_schools(property_id) prop_details = self.get_prop_details(property_id)
listing = Property( listing = Property(
mls=mls, mls=mls,
mls_id=( mls_id=(
@@ -176,11 +176,13 @@ class RealtorScraper(Scraper):
year_built=property_info["details"].get("year_built"), year_built=property_info["details"].get("year_built"),
garage=property_info["details"].get("garage"), garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"), stories=property_info["details"].get("stories"),
text=property_info["description"].get("text"), text=property_info.get("description", {}).get("text"),
), ),
days_on_mls=days_on_mls, days_on_mls=days_on_mls,
agents=agents_schools["agents"], agents=prop_details.get("agents"),
nearby_schools=agents_schools["schools"], nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
return [listing] return [listing]
@@ -274,7 +276,7 @@ class RealtorScraper(Scraper):
}""" }"""
variables = {"property_id": property_id} variables = {"property_id": property_id}
agents_schools = self.get_agents_schools(property_id) prop_details = self.get_prop_details(property_id)
payload = { payload = {
"query": query, "query": query,
@@ -292,8 +294,10 @@ class RealtorScraper(Scraper):
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
address=self._parse_address(property_info, search_type="handle_address"), address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info), description=self._parse_description(property_info),
agents=agents_schools["agents"], agents=prop_details.get("agents"),
nearby_schools=agents_schools["schools"], nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
] ]
@@ -486,7 +490,6 @@ class RealtorScraper(Scraper):
} }
response = self.session.post(self.SEARCH_GQL_URL, json=payload) response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response.raise_for_status()
response_json = response.json() response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search" search_key = "home_search" if "home_search" in query else "property_search"
@@ -521,7 +524,7 @@ class RealtorScraper(Scraper):
return return
property_id = result["property_id"] property_id = result["property_id"]
agents_schools = self.get_agents_schools(property_id) prop_details = self.get_prop_details(property_id)
realty_property = Property( realty_property = Property(
mls=mls, mls=mls,
@@ -546,11 +549,13 @@ class RealtorScraper(Scraper):
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result), description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result), neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name"), county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=result["location"]["county"].get("fips_code"), fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
agents=agents_schools["agents"], agents=prop_details.get("agents"),
nearby_schools=agents_schools["schools"], nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
return realty_property return realty_property
@@ -645,8 +650,8 @@ class RealtorScraper(Scraper):
return homes return homes
def get_agents_schools(self, property_id: str) -> dict: def get_prop_details(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }}}}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}' payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }} taxHistory: tax_history {{ __typename tax year assessment {{ __typename building land total }} }}estimates {{ __typename currentValues: current_values {{ __typename source {{ __typename type name }} estimate estimateHigh: estimate_high estimateLow: estimate_low date isBestHomeValue: isbest_homevalue }} }} }}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}'
response = self.session.post(self.PROPERTY_GQL, data=payload) response = self.session.post(self.PROPERTY_GQL, data=payload)
def get_key(keys: list): def get_key(keys: list):
@@ -656,14 +661,22 @@ class RealtorScraper(Scraper):
data = data[key] data = data[key]
return data return data
except (KeyError, TypeError): except (KeyError, TypeError):
return [] return {}
ads = get_key(["data", "home", "consumerAdvertisers"]) ads = get_key(["data", "home", "consumerAdvertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"]) schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads] agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads]
schools = [school["district"]["name"] for school in schools] schools = [school["district"]["name"] for school in schools]
return {"agents": agents, "schools": schools} return {
"agents": agents if agents else None,
"schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None,
}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:

View File

@@ -25,6 +25,8 @@ ordered_properties = [
"list_date", "list_date",
"sold_price", "sold_price",
"last_sold_date", "last_sold_date",
"assessed_value",
"estimated_value",
"lot_sqft", "lot_sqft",
"price_per_sqft", "price_per_sqft",
"latitude", "latitude",
@@ -71,7 +73,7 @@ def process_result(result: Property) -> pd.DataFrame:
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style.value prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half
@@ -83,7 +85,6 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["stories"] = description.stories prop_data["stories"] = description.stories
prop_data["text"] = description.text prop_data["text"] = description.text
properties_df = pd.DataFrame([prop_data]) properties_df = pd.DataFrame([prop_data])
properties_df = properties_df.reindex(columns=ordered_properties) properties_df = properties_df.reindex(columns=ordered_properties)

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.19" version = "0.3.20"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"