Compare commits

...

7 Commits

Author SHA1 Message Date
Zachary Hampton
ec036bb989 - optimizations & updated realtor headers 2024-05-20 12:13:30 -07:00
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
Zachary Hampton
3458a08383 - broker data 2024-05-11 21:35:29 -07:00
8 changed files with 86 additions and 27 deletions

View File

@@ -21,7 +21,7 @@
```bash ```bash
pip install -U homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage ## Usage

View File

@@ -49,11 +49,11 @@ def scrape_property(
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
properties_dfs = [process_result(result) for result in results] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""}) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@@ -5,6 +6,7 @@ from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
import json
@dataclass @dataclass
@@ -70,18 +72,25 @@ class Scraper:
@staticmethod @staticmethod
def get_access_token(): def get_access_token():
url = "https://graph.realtor.com/auth/token" device_id = str(uuid.uuid4()).upper()
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}' response = requests.post(
"https://graph.realtor.com/auth/token",
headers={ headers={
"Host": "graph.realtor.com", 'Host': 'graph.realtor.com',
"x-client-version": "24.20.4.149916", 'Accept': '*/*',
"accept": "*/*", 'Content-Type': 'Application/json',
"content-type": "Application/json", 'X-Client-ID': 'rdc_mobile_native,iphone',
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0", 'X-Visitor-ID': device_id,
"accept-language": "en-US,en;q=0.9", 'X-Client-Version': '24.21.23.679885',
} 'Accept-Language': 'en-US,en;q=0.9',
response = requests.post(url, headers=headers, data=payload) 'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
},
data=json.dumps({
"grant_type": "device_mobile",
"device_id": device_id,
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
}))
data = response.json() data = response.json()

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
@@ -36,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME" CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP" CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO" CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS" CONDOS = "CONDOS"
COOP = "COOP" COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX" DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +54,7 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
full_line: str | None = None
street: str | None = None street: str | None = None
unit: str | None = None unit: str | None = None
city: str | None = None city: str | None = None
@@ -121,7 +124,8 @@ class Property:
neighborhoods: Optional[str] = None neighborhoods: Optional[str] = None
county: Optional[str] = None county: Optional[str] = None
fips_code: Optional[str] = None fips_code: Optional[str] = None
agents: list[Agent] = None agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None assessed_value: int | None = None
estimated_value: int | None = None estimated_value: int | None = None

View File

@@ -4,13 +4,13 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None, longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])), alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -180,6 +181,7 @@ class RealtorScraper(Scraper):
), ),
days_on_mls=days_on_mls, days_on_mls=days_on_mls,
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -235,6 +237,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -295,6 +298,7 @@ class RealtorScraper(Scraper):
address=self._parse_address(property_info, search_type="handle_address"), address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info), description=self._parse_description(property_info),
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -350,6 +354,7 @@ class RealtorScraper(Scraper):
street_number street_number
street_name street_name
street_suffix street_suffix
line
unit unit
city city
state_code state_code
@@ -553,6 +558,7 @@ class RealtorScraper(Scraper):
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
agents=prop_details.get("agents"), agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"), estimated_value=prop_details.get("estimated_value"),
@@ -654,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data: if not self.extra_property_data:
return {} return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) { query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) { home(property_id: $property_id) {
__typename __typename
@@ -666,6 +674,12 @@ class RealtorScraper(Scraper):
phones { number type ext primary } phones { number type ext primary }
} }
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
@@ -700,7 +714,9 @@ class RealtorScraper(Scraper):
except (KeyError, TypeError, IndexError): except (KeyError, TypeError, IndexError):
return {} return {}
ads = get_key(["data", "home", "advertisers"]) agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"]) schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"]) assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"]) estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
@@ -709,11 +725,18 @@ class RealtorScraper(Scraper):
name=ad["name"], name=ad["name"],
email=ad["email"], email=ad["email"],
phones=ad["phones"] phones=ad["phones"]
) for ad in ads] ) for ad in agents]
brokers = [Broker(
name=ad["name"],
phone=ad["phone"],
website=ad["href"]
) for ad in advertisers if ad.get("type") != "Agent"]
schools = [school["district"]["name"] for school in schools if school['district'].get('name')] schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return { return {
"agents": agents if agents else None, "agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None, "schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None, "assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None, "estimated_value": estimated_value if estimated_value else None,
@@ -747,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"] address = result["address"]
return Address( return Address(
full_line=address.get("line"),
street=" ".join( street=" ".join(
part for part in [ part for part in [
address.get("street_number"), address.get("street_number"),
@@ -781,8 +805,8 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType(style) if style else None, style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent from .core.scrapers.models import Property, ListingType, Agent
@@ -10,6 +11,7 @@ ordered_properties = [
"status", "status",
"text", "text",
"style", "style",
"full_street_line",
"street", "street",
"unit", "unit",
"city", "city",
@@ -40,6 +42,9 @@ ordered_properties = [
"agent", "agent",
"agent_email", "agent_email",
"agent_phones", "agent_phones",
"broker",
"broker_phone",
"broker_website",
"nearby_schools", "nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
@@ -52,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
@@ -65,13 +71,20 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["agent_email"] = agents[0].email prop_data["agent_email"] = agents[0].email
prop_data["agent_phones"] = agents[0].phones prop_data["agent_phones"] = agents[0].phones
if "brokers" in prop_data:
brokers = prop_data["brokers"]
if brokers:
prop_data["broker"] = brokers[0].name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if type(description.style) == str else description.style.value prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.21" version = "0.3.28"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main" homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.10,<3.13" python = ">=3.9,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"

View File

@@ -155,4 +155,13 @@ def test_realtor_without_extra_details():
), ),
] ]
assert results[0] != results[1] assert not results[0].equals(results[1])
def test_pr_zip_code():
results = scrape_property(
location="00741",
listing_type="for_sale",
)
assert results is not None and len(results) > 0