Refactor scraper to use direct requests and bump to 0.8.18

- Replace session-based approach with direct requests calls
- Move headers to module-level DEFAULT_HEADERS constant
- Temporarily disable extra_property_data feature

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
zacharyhampton
2025-12-26 00:29:53 -07:00
parent 129ab37dff
commit 8a6ac96db4
3 changed files with 32 additions and 45 deletions

View File

@@ -2,8 +2,6 @@ from __future__ import annotations
from typing import Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
@@ -11,6 +9,27 @@ import json
from pydantic import BaseModel
# Default HTTP request headers, defined once at module level so scraper
# modules can import them (the realtor scraper passes this dict as
# `headers=` to `requests.post` for its GraphQL search call).
# The values present the client as a desktop Chrome 135 browser on macOS
# issuing a JSON CORS request with realtor.com as Origin/Referer.
# NOTE(review): this is a single shared mutable dict — presumably callers
# treat it as read-only; copy it before adding per-request headers.
DEFAULT_HEADERS = {
'Content-Type': 'application/json',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Origin': 'https://www.realtor.com',
'Pragma': 'no-cache',
'Referer': 'https://www.realtor.com/',
'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE',
'rdc-client-version': '3.0.2515',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-is-bot': 'false',
}
class ScraperInput(BaseModel):
location: str
listing_type: ListingType | list[ListingType] | None
@@ -60,8 +79,6 @@ class ScraperInput(BaseModel):
class Scraper:
session = None
def __init__(
self,
scraper_input: ScraperInput,
@@ -69,42 +86,8 @@ class Scraper:
self.location = scraper_input.location
self.listing_type = scraper_input.listing_type
self.property_type = scraper_input.property_type
if not self.session:
Scraper.session = requests.Session()
retries = Retry(
total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
'Content-Type': 'application/json',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Origin': 'https://www.realtor.com',
'Pragma': 'no-cache',
'Referer': 'https://www.realtor.com/',
'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE',
'rdc-client-version': '3.0.2515',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-is-bot': 'false',
}
)
self.proxy = scraper_input.proxy
if self.proxy:
proxies = {"http": self.proxy, "https": self.proxy}
self.session.proxies.update(proxies)
self.proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
self.listing_type = scraper_input.listing_type
self.radius = scraper_input.radius
@@ -115,7 +98,7 @@ class Scraper:
self.date_from_precision = scraper_input.date_from_precision
self.date_to_precision = scraper_input.date_to_precision
self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data
self.extra_property_data = False # TODO: temporarily disabled
self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit
self.offset = scraper_input.offset

View File

@@ -8,9 +8,8 @@ This module implements the scraper for realtor.com
from __future__ import annotations
import json
import re
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from json import JSONDecodeError
from typing import Dict, Union
@@ -22,7 +21,7 @@ from tenacity import (
stop_after_attempt,
)
from .. import Scraper
from .. import Scraper, DEFAULT_HEADERS
from ....exceptions import AuthenticationError
from ..models import (
Property,
@@ -69,7 +68,12 @@ class RealtorScraper(Scraper):
"variables": variables,
}
response = self.session.post(self.SEARCH_GQL_URL, data=json.dumps(payload, separators=(',', ':')))
response = requests.post(
self.SEARCH_GQL_URL,
headers=DEFAULT_HEADERS,
data=json.dumps(payload, separators=(',', ':')),
proxies=self.proxies
)
if response.status_code == 403:
if not self.proxy:

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.8.17"
version = "0.8.18"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest"