Compare commits

..

8 Commits

Author SHA1 Message Date
Zachary Hampton
93e6778a48 - exclude_pending parameter 2024-05-31 22:17:29 -07:00
Zachary Hampton
ec036bb989 - optimizations & updated realtor headers 2024-05-20 12:13:30 -07:00
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
Zachary Hampton
3458a08383 - broker data 2024-05-11 21:35:29 -07:00
8 changed files with 113 additions and 32 deletions

View File

@@ -21,7 +21,7 @@
```bash
pip install -U homeharvest
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
_Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage
@@ -90,9 +90,11 @@ Optional
├── foreclosure (True/False): If set, fetches only foreclosures
├── proxy (string): In format 'http://user:pass@host:port'
├── proxy (string): In format 'http://user:pass@host:port'
└── extra_property_data (bool): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
```
### Property Schema
@@ -142,6 +144,11 @@ Property
│ ├── agent
│ ├── agent_email
│ └── agent_phones
├── Broker Info:
│ ├── broker
│ ├── broker_phone
│ └── broker_website
```
### Exceptions

View File

@@ -17,11 +17,12 @@ def scrape_property(
date_to: str = None,
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping
@@ -29,6 +30,7 @@ def scrape_property(
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
"""
validate_input(listing_type)
validate_dates(date_from, date_to)
@@ -44,16 +46,17 @@ def scrape_property(
date_to=date_to,
foreclosure=foreclosure,
extra_property_data=extra_property_data,
exclude_pending=exclude_pending,
)
site = RealtorScraper(scraper_input)
results = site.search()
properties_dfs = [process_result(result) for result in results]
properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs:
return pd.DataFrame()
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""})
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
@@ -5,6 +6,7 @@ from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName
import json
@dataclass
@@ -19,6 +21,7 @@ class ScraperInput:
date_to: str | None = None
foreclosure: bool | None = False
extra_property_data: bool | None = True
exclude_pending: bool | None = False
class Scraper:
@@ -60,6 +63,7 @@ class Scraper:
self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending
def search(self) -> list[Property]: ...
@@ -70,18 +74,25 @@ class Scraper:
@staticmethod
def get_access_token():
url = "https://graph.realtor.com/auth/token"
device_id = str(uuid.uuid4()).upper()
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
headers = {
"Host": "graph.realtor.com",
"x-client-version": "24.20.4.149916",
"accept": "*/*",
"content-type": "Application/json",
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
"accept-language": "en-US,en;q=0.9",
}
response = requests.post(url, headers=headers, data=payload)
response = requests.post(
"https://graph.realtor.com/auth/token",
headers={
'Host': 'graph.realtor.com',
'Accept': '*/*',
'Content-Type': 'Application/json',
'X-Client-ID': 'rdc_mobile_native,iphone',
'X-Visitor-ID': device_id,
'X-Client-Version': '24.21.23.679885',
'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
},
data=json.dumps({
"grant_type": "device_mobile",
"device_id": device_id,
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
}))
data = response.json()

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Optional
@@ -36,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS"
COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +54,7 @@ class PropertyType(Enum):
@dataclass
class Address:
full_line: str | None = None
street: str | None = None
unit: str | None = None
city: str | None = None
@@ -121,7 +124,8 @@ class Property:
neighborhoods: Optional[str] = None
county: Optional[str] = None
fips_code: Optional[str] = None
agents: list[Agent] = None
agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None

View File

@@ -4,13 +4,13 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com
"""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Dict, Union, Optional
from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent
from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
class RealtorScraper(Scraper):
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id
}
address {
line
street_direction
street_number
street_name
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"),
description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"),
@@ -180,6 +181,7 @@ class RealtorScraper(Scraper):
),
days_on_mls=days_on_mls,
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -235,6 +237,7 @@ class RealtorScraper(Scraper):
stories
}
address {
line
street_direction
street_number
street_name
@@ -295,6 +298,7 @@ class RealtorScraper(Scraper):
address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -350,6 +354,7 @@ class RealtorScraper(Scraper):
street_number
street_name
street_suffix
line
unit
city
state_code
@@ -520,7 +525,7 @@ class RealtorScraper(Scraper):
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING:
if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
return
property_id = result["property_id"]
@@ -553,6 +558,7 @@ class RealtorScraper(Scraper):
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result),
agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
@@ -654,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data:
return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
@@ -665,7 +673,13 @@ class RealtorScraper(Scraper):
email
phones { number type ext primary }
}
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
@@ -700,7 +714,9 @@ class RealtorScraper(Scraper):
except (KeyError, TypeError, IndexError):
return {}
ads = get_key(["data", "home", "advertisers"])
agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
@@ -709,11 +725,18 @@ class RealtorScraper(Scraper):
name=ad["name"],
email=ad["email"],
phones=ad["phones"]
) for ad in ads]
) for ad in agents]
brokers = [Broker(
name=ad["name"],
phone=ad["phone"],
website=ad["href"]
) for ad in advertisers if ad.get("type") != "Agent"]
schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return {
"agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None,
@@ -747,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"]
return Address(
full_line=address.get("line"),
street=" ".join(
part for part in [
address.get("street_number"),
@@ -781,8 +805,8 @@ class RealtorScraper(Scraper):
return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=PropertyType(style) if style else None,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd
from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent
@@ -10,6 +11,7 @@ ordered_properties = [
"status",
"text",
"style",
"full_street_line",
"street",
"unit",
"city",
@@ -40,6 +42,9 @@ ordered_properties = [
"agent",
"agent_email",
"agent_phones",
"broker",
"broker_phone",
"broker_website",
"nearby_schools",
"primary_photo",
"alt_photos",
@@ -52,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city
@@ -65,13 +71,20 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["agent_email"] = agents[0].email
prop_data["agent_phones"] = agents[0].phones
if "brokers" in prop_data:
brokers = prop_data["brokers"]
if brokers:
prop_data["broker"] = brokers[0].name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.21"
version = "0.3.29"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
python = ">=3.9,<3.13"
requests = "^2.31.0"
pandas = "^2.1.1"

View File

@@ -4,7 +4,7 @@ from homeharvest import scrape_property
def test_realtor_pending_or_contingent():
pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale")
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale", exclude_pending=True)
assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
assert len(pending_or_contingent_result) != len(regular_result)
@@ -155,4 +155,23 @@ def test_realtor_without_extra_details():
),
]
assert results[0] != results[1]
assert not results[0].equals(results[1])
def test_pr_zip_code():
    """A for-sale search by ZIP code "00741" must return at least one row.

    NOTE(review): the test name suggests this is a Puerto Rico ZIP code;
    confirm. The call performs a live scrape, so it needs network access.
    """
    listings = scrape_property(listing_type="for_sale", location="00741")

    assert listings is not None
    assert len(listings) > 0
def test_exclude_pending():
    """A "pending" search must still return rows when exclude_pending is set.

    The flag is documented as having no effect when the listing type itself
    is pending, so the result for this ZIP code is expected to be non-empty.
    The call performs a live scrape, so it needs network access.
    """
    pending_listings = scrape_property(
        exclude_pending=True,
        listing_type="pending",
        location="33567",
    )

    assert pending_listings is not None
    assert len(pending_listings) > 0