HomeHarvest/homeharvest/core/scrapers/__init__.py


from __future__ import annotations
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName
import json


@dataclass
class ScraperInput:
    location: str
    listing_type: ListingType
    radius: float | None = None
    mls_only: bool | None = False
    proxy: str | None = None
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    foreclosure: bool | None = False
    extra_property_data: bool | None = True
    exclude_pending: bool | None = False
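

# A ScraperInput bundles the user-supplied search options. Hypothetical example of
# constructing one (ListingType.FOR_SALE is assumed to be an enum member in .models):
#
#     scraper_input = ScraperInput(
#         location="San Diego, CA",
#         listing_type=ListingType.FOR_SALE,
#         last_x_days=30,
#     )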


class Scraper:
    # A single requests.Session shared by all scraper instances, so the access token,
    # retry policy, and proxies are configured only once.
    session = None

    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type

        if not self.session:
            Scraper.session = requests.Session()
            # Retry transient 429/403 responses with backoff between attempts.
            retries = Retry(
                total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
            )
            adapter = HTTPAdapter(max_retries=retries)
            Scraper.session.mount("http://", adapter)
            Scraper.session.mount("https://", adapter)
            Scraper.session.headers.update(
                {
                    "auth": f"Bearer {self.get_access_token()}",
                    "apollographql-client-name": "com.move.Realtor-apollo-ios",
                }
            )

        if scraper_input.proxy:
            proxy_url = scraper_input.proxy
            proxies = {"http": proxy_url, "https": proxy_url}
            self.session.proxies.update(proxies)

        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.foreclosure = scraper_input.foreclosure
        self.extra_property_data = scraper_input.extra_property_data
        self.exclude_pending = scraper_input.exclude_pending

    # Stubs meant to be overridden by concrete site scrapers.
    def search(self) -> list[Property]: ...

    @staticmethod
    def _parse_home(home) -> Property: ...

    def handle_location(self): ...

    @staticmethod
    def get_access_token():
        # Request a visitor access token via the "device_mobile" grant, presenting a
        # freshly generated device id and headers mimicking the Realtor.com iOS app.
        device_id = str(uuid.uuid4()).upper()

        response = requests.post(
            "https://graph.realtor.com/auth/token",
            headers={
                'Host': 'graph.realtor.com',
                'Accept': '*/*',
                'Content-Type': 'Application/json',
                'X-Client-ID': 'rdc_mobile_native,iphone',
                'X-Visitor-ID': device_id,
                'X-Client-Version': '24.21.23.679885',
                'Accept-Language': 'en-US,en;q=0.9',
                'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
            },
            data=json.dumps({
                "grant_type": "device_mobile",
                "device_id": device_id,
                "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
            }))

        data = response.json()

        if not (access_token := data.get("access_token")):
            raise AuthenticationError(
                "Failed to get access token, use a proxy/vpn or wait a moment and try again.",
                response=response
            )

        return access_token
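

# End-to-end sketch (hypothetical): a concrete site scraper subclasses Scraper and
# implements search(). The RealtorScraper name and import path below are assumptions
# for illustration, not defined in this file.
#
#     from .realtor import RealtorScraper
#
#     scraper = RealtorScraper(scraper_input)   # scraper_input built as in the example above
#     properties = scraper.search()             # -> list[Property]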