mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-04 19:44:29 -08:00
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
189 lines
6.7 KiB
Python
189 lines
6.7 KiB
Python
from __future__ import annotations
|
|
from typing import Union
|
|
|
|
import requests
|
|
from requests.adapters import HTTPAdapter
|
|
from urllib3.util.retry import Retry
|
|
import uuid
|
|
from ...exceptions import AuthenticationError
|
|
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
|
|
import json
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class ScraperInput(BaseModel):
    # Bundle of search parameters handed to ``Scraper.__init__``, which copies
    # each field onto the scraper instance.  Only ``location`` and
    # ``listing_type`` have no default; everything else is optional.

    # --- Core search parameters ---
    location: str
    listing_type: ListingType | list[ListingType] | None
    property_type: list[SearchPropertyType] | None = None
    radius: float | None = None
    mls_only: bool | None = False
    proxy: str | None = None
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    date_from_precision: str | None = None  # "day" or "hour"
    date_to_precision: str | None = None  # "day" or "hour"
    foreclosure: bool | None = False
    extra_property_data: bool | None = True
    exclude_pending: bool | None = False
    limit: int = 10000
    offset: int = 0
    return_type: ReturnType = ReturnType.pandas

    # --- Date/time filtering ---
    past_hours: int | None = None

    # --- last_update_date filtering ---
    updated_since: str | None = None
    updated_in_past_hours: int | None = None

    # --- Property filtering (min/max bounds) ---
    beds_min: int | None = None
    beds_max: int | None = None
    baths_min: float | None = None
    baths_max: float | None = None
    sqft_min: int | None = None
    sqft_max: int | None = None
    price_min: int | None = None
    price_max: int | None = None
    lot_sqft_min: int | None = None
    lot_sqft_max: int | None = None
    year_built_min: int | None = None
    year_built_max: int | None = None

    # --- Sorting ---
    sort_by: str | None = None
    sort_direction: str = "desc"

    # --- Pagination control ---
    parallel: bool = True
|
|
|
|
|
|
class Scraper:
    """Base scraper holding the search parameters and a shared HTTP session.

    ``__init__`` copies every field of a ``ScraperInput`` onto the instance
    and, the first time any scraper is constructed, builds one
    ``requests.Session`` that is stored on the class and reused by all
    instances.  ``search``, ``_parse_home`` and ``handle_location`` have
    placeholder (``...``) bodies here.
    """

    # Shared by every instance; created lazily on first construction.
    session = None

    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        """Store the search parameters and make sure the shared session exists.

        Args:
            scraper_input: Parameter bundle whose fields are copied verbatim
                onto ``self``.
        """
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type
        self.property_type = scraper_input.property_type

        if not self.session:
            Scraper.session = requests.Session()

            # Retry GET/POST up to 3 times on HTTP 429, with backoff.
            retries = Retry(
                total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"])
            )

            adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
            Scraper.session.mount("http://", adapter)
            Scraper.session.mount("https://", adapter)
            Scraper.session.headers.update(
                {
                    'Content-Type': 'application/json',
                    'Accept': '*/*',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Cache-Control': 'no-cache',
                    'Pragma': 'no-cache',
                    'rdc-client-name': 'rdc-home',
                    'rdc-client-version': '2.68.0',
                    'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
                    'sec-ch-ua-mobile': '?0',
                    'sec-ch-ua-platform': '"macOS"',
                    'sec-fetch-dest': 'empty',
                    'sec-fetch-mode': 'cors',
                    'sec-fetch-site': 'same-site',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
                }
            )

        self.proxy = scraper_input.proxy
        if self.proxy:
            proxies = {"http": self.proxy, "https": self.proxy}
            self.session.proxies.update(proxies)
        else:
            # The session is shared across instances, so a proxy installed by
            # an earlier Scraper would otherwise silently stay active for this
            # one even though it asked for none.
            self.session.proxies.pop("http", None)
            self.session.proxies.pop("https", None)

        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.date_from_precision = scraper_input.date_from_precision
        self.date_to_precision = scraper_input.date_to_precision
        self.foreclosure = scraper_input.foreclosure
        self.extra_property_data = scraper_input.extra_property_data
        self.exclude_pending = scraper_input.exclude_pending
        self.limit = scraper_input.limit
        self.offset = scraper_input.offset
        self.return_type = scraper_input.return_type

        # Date/time filtering
        self.past_hours = scraper_input.past_hours

        # last_update_date filtering
        self.updated_since = scraper_input.updated_since
        self.updated_in_past_hours = scraper_input.updated_in_past_hours

        # Property filtering
        self.beds_min = scraper_input.beds_min
        self.beds_max = scraper_input.beds_max
        self.baths_min = scraper_input.baths_min
        self.baths_max = scraper_input.baths_max
        self.sqft_min = scraper_input.sqft_min
        self.sqft_max = scraper_input.sqft_max
        self.price_min = scraper_input.price_min
        self.price_max = scraper_input.price_max
        self.lot_sqft_min = scraper_input.lot_sqft_min
        self.lot_sqft_max = scraper_input.lot_sqft_max
        self.year_built_min = scraper_input.year_built_min
        self.year_built_max = scraper_input.year_built_max

        # Sorting
        self.sort_by = scraper_input.sort_by
        self.sort_direction = scraper_input.sort_direction

        # Pagination control
        self.parallel = scraper_input.parallel

    # Placeholder body; presumably overridden by site-specific subclasses.
    def search(self) -> list[Union[Property | dict]]: ...

    # Placeholder body; presumably overridden by site-specific subclasses.
    @staticmethod
    def _parse_home(home) -> Property: ...

    # Placeholder body; presumably overridden by site-specific subclasses.
    def handle_location(self): ...

    @staticmethod
    def get_access_token():
        """Request an access token from ``graph.realtor.com/auth/token``.

        A fresh upper-cased UUID4 is sent both as the ``X-Visitor-ID`` header
        and as ``device_id`` in the JSON payload.  The request goes through
        ``requests.post`` directly, not through the shared session.

        Returns:
            The ``access_token`` value from the JSON response.

        Raises:
            AuthenticationError: If the response is not a JSON object or has
                no truthy ``access_token`` entry.
        """
        device_id = str(uuid.uuid4()).upper()

        response = requests.post(
            "https://graph.realtor.com/auth/token",
            headers={
                "Host": "graph.realtor.com",
                "Accept": "*/*",
                "Content-Type": "Application/json",
                "X-Client-ID": "rdc_mobile_native,iphone",
                "X-Visitor-ID": device_id,
                "X-Client-Version": "24.21.23.679885",
                "Accept-Language": "en-US,en;q=0.9",
                "User-Agent": "Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0",
            },
            data=json.dumps(
                {
                    "grant_type": "device_mobile",
                    "device_id": device_id,
                    "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone",
                }
            ),
        )

        try:
            data = response.json()
        except ValueError:
            # Body was not valid JSON; report it as an authentication failure
            # below instead of leaking a raw decode error to the caller.
            data = None

        # Only a JSON object can carry the token; anything else is a failure.
        access_token = data.get("access_token") if isinstance(data, dict) else None
        if not access_token:
            raise AuthenticationError(
                "Failed to get access token, use a proxy/vpn or wait a moment and try again.", response=response
            )

        return access_token
|