From 8a6ac96db419b56a18d295935217649039bcdd0a Mon Sep 17 00:00:00 2001 From: zacharyhampton Date: Fri, 26 Dec 2025 00:29:53 -0700 Subject: [PATCH] Refactor scraper to use direct requests and bump to 0.8.18 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace session-based approach with direct requests calls - Move headers to module-level DEFAULT_HEADERS constant - Temporarily disable extra_property_data feature 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- homeharvest/core/scrapers/__init__.py | 63 +++++++------------ homeharvest/core/scrapers/realtor/__init__.py | 12 ++-- pyproject.toml | 2 +- 3 files changed, 32 insertions(+), 45 deletions(-) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index ba79253..5b71d98 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -2,8 +2,6 @@ from __future__ import annotations from typing import Union import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry import uuid from ...exceptions import AuthenticationError from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType @@ -11,6 +9,27 @@ import json from pydantic import BaseModel +DEFAULT_HEADERS = { + 'Content-Type': 'application/json', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'no-cache', + 'Origin': 'https://www.realtor.com', + 'Pragma': 'no-cache', + 'Referer': 'https://www.realtor.com/', + 'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE', + 'rdc-client-version': '3.0.2515', + 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-site', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', + 'x-is-bot': 'false', +} + + class ScraperInput(BaseModel): location: str listing_type: ListingType | list[ListingType] | None @@ -60,8 +79,6 @@ class ScraperInput(BaseModel): class Scraper: - session = None - def __init__( self, scraper_input: ScraperInput, @@ -69,42 +86,8 @@ class Scraper: self.location = scraper_input.location self.listing_type = scraper_input.listing_type self.property_type = scraper_input.property_type - - if not self.session: - Scraper.session = requests.Session() - retries = Retry( - total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"]) - ) - - adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20) - Scraper.session.mount("http://", adapter) - Scraper.session.mount("https://", adapter) - Scraper.session.headers.update( - { - 'Content-Type': 'application/json', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.9', - 'Cache-Control': 'no-cache', - 'Origin': 'https://www.realtor.com', - 'Pragma': 'no-cache', - 'Referer': 'https://www.realtor.com/', - 'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE', - 'rdc-client-version': '3.0.2515', - 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-site', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', - 'x-is-bot': 'false', - } - ) - self.proxy = scraper_input.proxy - if self.proxy: - proxies = {"http": self.proxy, "https": self.proxy} - self.session.proxies.update(proxies) + self.proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None self.listing_type = scraper_input.listing_type self.radius = scraper_input.radius @@ -115,7 +98,7 @@ class Scraper: self.date_from_precision = scraper_input.date_from_precision self.date_to_precision = scraper_input.date_to_precision self.foreclosure = scraper_input.foreclosure - self.extra_property_data = scraper_input.extra_property_data + self.extra_property_data = False # TODO: temporarily disabled self.exclude_pending = scraper_input.exclude_pending self.limit = scraper_input.limit self.offset = scraper_input.offset diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index e73ac49..1342194 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -8,9 +8,8 @@ This module implements the scraper for realtor.com from __future__ import annotations import json -import re +import requests from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime from json import JSONDecodeError from typing import Dict, Union @@ -22,7 +21,7 @@ from tenacity import ( stop_after_attempt, ) -from .. import Scraper +from .. import Scraper, DEFAULT_HEADERS from ....exceptions import AuthenticationError from ..models import ( Property, @@ -69,7 +68,12 @@ class RealtorScraper(Scraper): "variables": variables, } - response = self.session.post(self.SEARCH_GQL_URL, data=json.dumps(payload, separators=(',', ':'))) + response = requests.post( + self.SEARCH_GQL_URL, + headers=DEFAULT_HEADERS, + data=json.dumps(payload, separators=(',', ':')), + proxies=self.proxies + ) if response.status_code == 403: if not self.proxy: diff --git a/pyproject.toml b/pyproject.toml index e387a93..4e5234f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.8.17" +version = "0.8.18" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest"