HomeHarvest/homeharvest/core/scrapers/__init__.py

from __future__ import annotations
from typing import Union

import requests
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
import json
from pydantic import BaseModel


DEFAULT_HEADERS = {
    'Content-Type': 'application/json',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Origin': 'https://www.realtor.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.realtor.com/',
    'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE',
    'rdc-client-version': '3.0.2515',
    'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
    'x-is-bot': 'false',
}


class ScraperInput(BaseModel):
    location: str
    listing_type: ListingType | list[ListingType] | None
    property_type: list[SearchPropertyType] | None = None
    radius: float | None = None
    mls_only: bool | None = False
    proxy: str | None = None
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    date_from_precision: str | None = None  # "day" or "hour"
    date_to_precision: str | None = None    # "day" or "hour"
    foreclosure: bool | None = False
    extra_property_data: bool | None = True
    exclude_pending: bool | None = False
    limit: int = 10000
    offset: int = 0
    return_type: ReturnType = ReturnType.pandas

    # New date/time filtering parameters
    past_hours: int | None = None

    # New last_update_date filtering parameters
    updated_since: str | None = None
    updated_in_past_hours: int | None = None

    # New property filtering parameters
    beds_min: int | None = None
    beds_max: int | None = None
    baths_min: float | None = None
    baths_max: float | None = None
    sqft_min: int | None = None
    sqft_max: int | None = None
    price_min: int | None = None
    price_max: int | None = None
    lot_sqft_min: int | None = None
    lot_sqft_max: int | None = None
    year_built_min: int | None = None
    year_built_max: int | None = None

    # New sorting parameters
    sort_by: str | None = None
    sort_direction: str = "desc"

    # Pagination control
    parallel: bool = True


class Scraper:
    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type
        self.property_type = scraper_input.property_type
        self.proxy = scraper_input.proxy
        self.proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None

        self.listing_type = scraper_input.listing_type
        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.date_from_precision = scraper_input.date_from_precision
        self.date_to_precision = scraper_input.date_to_precision
        self.foreclosure = scraper_input.foreclosure
        self.extra_property_data = False  # TODO: temporarily disabled
        self.exclude_pending = scraper_input.exclude_pending
        self.limit = scraper_input.limit
        self.offset = scraper_input.offset
        self.return_type = scraper_input.return_type

        # New date/time filtering
        self.past_hours = scraper_input.past_hours

        # New last_update_date filtering
        self.updated_since = scraper_input.updated_since
        self.updated_in_past_hours = scraper_input.updated_in_past_hours

        # New property filtering
        self.beds_min = scraper_input.beds_min
        self.beds_max = scraper_input.beds_max
        self.baths_min = scraper_input.baths_min
        self.baths_max = scraper_input.baths_max
        self.sqft_min = scraper_input.sqft_min
        self.sqft_max = scraper_input.sqft_max
        self.price_min = scraper_input.price_min
        self.price_max = scraper_input.price_max
        self.lot_sqft_min = scraper_input.lot_sqft_min
        self.lot_sqft_max = scraper_input.lot_sqft_max
        self.year_built_min = scraper_input.year_built_min
        self.year_built_max = scraper_input.year_built_max

        # New sorting
        self.sort_by = scraper_input.sort_by
        self.sort_direction = scraper_input.sort_direction

        # Pagination control
        self.parallel = scraper_input.parallel

    def search(self) -> list[Union[Property | dict]]: ...

    @staticmethod
    def _parse_home(home) -> Property: ...

    def handle_location(self): ...

    @staticmethod
    def get_access_token():
        device_id = str(uuid.uuid4()).upper()

        response = requests.post(
            "https://graph.realtor.com/auth/token",
            headers={
                "Host": "graph.realtor.com",
                "Accept": "*/*",
                "Content-Type": "Application/json",
                "X-Client-ID": "rdc_mobile_native,iphone",
                "X-Visitor-ID": device_id,
                "X-Client-Version": "24.21.23.679885",
                "Accept-Language": "en-US,en;q=0.9",
                "User-Agent": "Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0",
            },
            data=json.dumps(
                {
                    "grant_type": "device_mobile",
                    "device_id": device_id,
                    "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone",
                }
            ),
        )

        data = response.json()

        if not (access_token := data.get("access_token")):
            raise AuthenticationError(
                "Failed to get access token, use a proxy/vpn or wait a moment and try again.", response=response
            )

        return access_token