Compare commits

...

3 Commits

Author SHA1 Message Date
zacharyhampton
8a6ac96db4 Refactor scraper to use direct requests and bump to 0.8.18
- Replace session-based approach with direct requests calls
- Move headers to module-level DEFAULT_HEADERS constant
- Temporarily disable extra_property_data feature

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 00:29:53 -07:00
zacharyhampton
129ab37dff Version bump to 0.8.17
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 19:11:10 -07:00
zacharyhampton
9a0cac650e Version bump to 0.8.16 2025-12-21 16:22:03 -07:00
4 changed files with 57 additions and 79 deletions

View File

@@ -2,8 +2,6 @@ from __future__ import annotations
from typing import Union from typing import Union
import requests import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType
@@ -11,6 +9,27 @@ import json
from pydantic import BaseModel from pydantic import BaseModel
# Module-level default HTTP headers. In this change set they are passed
# explicitly as `headers=DEFAULT_HEADERS` to the `requests.post` call in the
# realtor scraper, replacing the header set previously installed on the
# shared `requests.Session`.
# NOTE(review): this is a plain mutable dict shared by reference by every
# importer — presumably callers are expected to treat it as read-only; confirm.
DEFAULT_HEADERS = {
'Content-Type': 'application/json',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Origin': 'https://www.realtor.com',
'Pragma': 'no-cache',
'Referer': 'https://www.realtor.com/',
# Client identification values; they differ from the ones the removed
# session setup used ('rdc-home' / '2.68.0').
'rdc-client-name': 'RDC_WEB_SRP_FS_PAGE',
'rdc-client-version': '3.0.2515',
# Client-hint and fetch-metadata headers; the values name Chrome 135 on macOS,
# matching the User-Agent string below.
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-is-bot': 'false',
}
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
location: str location: str
listing_type: ListingType | list[ListingType] | None listing_type: ListingType | list[ListingType] | None
@@ -60,8 +79,6 @@ class ScraperInput(BaseModel):
class Scraper: class Scraper:
session = None
def __init__( def __init__(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
@@ -69,39 +86,8 @@ class Scraper:
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.property_type = scraper_input.property_type self.property_type = scraper_input.property_type
if not self.session:
Scraper.session = requests.Session()
retries = Retry(
total=3, backoff_factor=4, status_forcelist=[429], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=20)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
'Content-Type': 'application/json',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'rdc-client-name': 'rdc-home',
'rdc-client-version': '2.68.0',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
)
self.proxy = scraper_input.proxy self.proxy = scraper_input.proxy
if self.proxy: self.proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
proxies = {"http": self.proxy, "https": self.proxy}
self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.radius = scraper_input.radius self.radius = scraper_input.radius
@@ -112,7 +98,7 @@ class Scraper:
self.date_from_precision = scraper_input.date_from_precision self.date_from_precision = scraper_input.date_from_precision
self.date_to_precision = scraper_input.date_to_precision self.date_to_precision = scraper_input.date_to_precision
self.foreclosure = scraper_input.foreclosure self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = False # TODO: temporarily disabled
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.offset = scraper_input.offset self.offset = scraper_input.offset

View File

@@ -8,9 +8,8 @@ This module implements the scraper for realtor.com
from __future__ import annotations from __future__ import annotations
import json import json
import re import requests
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
from typing import Dict, Union from typing import Dict, Union
@@ -22,14 +21,14 @@ from tenacity import (
stop_after_attempt, stop_after_attempt,
) )
from .. import Scraper from .. import Scraper, DEFAULT_HEADERS
from ....exceptions import AuthenticationError from ....exceptions import AuthenticationError
from ..models import ( from ..models import (
Property, Property,
ListingType, ListingType,
ReturnType ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT, SEARCH_SUGGESTIONS_QUERY from .queries import GENERAL_RESULTS_QUERY, HOMES_DATA, SEARCH_SUGGESTIONS_QUERY
from .processors import ( from .processors import (
process_property, process_property,
process_extra_property_details, process_extra_property_details,
@@ -69,7 +68,12 @@ class RealtorScraper(Scraper):
"variables": variables, "variables": variables,
} }
response = self.session.post(self.SEARCH_GQL_URL, data=json.dumps(payload, separators=(',', ':'))) response = requests.post(
self.SEARCH_GQL_URL,
headers=DEFAULT_HEADERS,
data=json.dumps(payload, separators=(',', ':')),
proxies=self.proxies
)
if response.status_code == 403: if response.status_code == 403:
if not self.proxy: if not self.proxy:
@@ -126,6 +130,11 @@ class RealtorScraper(Scraper):
} }
if geo.get("area_type") == "address": if geo.get("area_type") == "address":
# Try to get mpr_id directly from API response first
if geo.get("mpr_id"):
result["mpr_id"] = geo.get("mpr_id")
else:
# Fallback: extract from _id field if it has addr: prefix
geo_id = geo.get("_id", "") geo_id = geo.get("_id", "")
if geo_id.startswith("addr:"): if geo_id.startswith("addr:"):
result["mpr_id"] = geo_id.replace("addr:", "") result["mpr_id"] = geo_id.replace("addr:", "")
@@ -166,13 +175,10 @@ class RealtorScraper(Scraper):
def handle_home(self, property_id: str) -> list[Property]: def handle_home(self, property_id: str) -> list[Property]:
"""Fetch single home with proper error handling.""" """Fetch single home with proper error handling."""
query = ( query = (
"""%s """query GetHomeDetails($property_id: ID!) {
query GetHomeDetails($property_id: ID!) { home(property_id: $property_id) %s
home(property_id: $property_id) {
...HomeDetailsFragment
}
}""" }"""
% HOME_FRAGMENT % HOMES_DATA
) )
variables = {"property_id": property_id} variables = {"property_id": property_id}
@@ -419,9 +425,7 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
} }""" % (
%s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -430,13 +434,11 @@ class RealtorScraper(Scraper):
pending_or_contingent_param, pending_or_contingent_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query GetHomeSearch( query = """query GetHomeSearch(
$search_location: SearchLocation, $search_location: SearchLocation,
$offset: Int, $offset: Int
) { ) {
homeSearch: home_search( homeSearch: home_search(
query: { query: {
@@ -453,9 +455,7 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s ) %s
} }""" % (
%s
%s""" % (
is_foreclosure, is_foreclosure,
status_param, status_param,
date_param, date_param,
@@ -465,8 +465,6 @@ class RealtorScraper(Scraper):
bucket_param, bucket_param,
sort_param, sort_param,
GENERAL_RESULTS_QUERY, GENERAL_RESULTS_QUERY,
SEARCH_RESULTS_FRAGMENT,
LISTING_PHOTOS_FRAGMENT,
) )
else: #: general search, came from an address else: #: general search, came from an address
query = ( query = (
@@ -481,10 +479,8 @@ class RealtorScraper(Scraper):
limit: 1 limit: 1
offset: $offset offset: $offset
) %s ) %s
} }"""
%s % GENERAL_RESULTS_QUERY
%s"""
% (GENERAL_RESULTS_QUERY, SEARCH_RESULTS_FRAGMENT, LISTING_PHOTOS_FRAGMENT)
) )
response_json = self._graphql_post(query, variables, "GetHomeSearch") response_json = self._graphql_post(query, variables, "GetHomeSearch")
@@ -1123,12 +1119,10 @@ class RealtorScraper(Scraper):
property_ids = list(set(property_ids)) property_ids = list(set(property_ids))
fragments = "\n".join( fragments = "\n".join(
f'home_{property_id}: home(property_id: {property_id}) {{ ...SearchFragment }}' f'home_{property_id}: home(property_id: {property_id}) {HOMES_DATA}'
for property_id in property_ids for property_id in property_ids
) )
query = f"""{HOME_FRAGMENT} query = f"""query GetHome {{
query GetHome {{
{fragments} {fragments}
}}""" }}"""

View File

@@ -1,5 +1,5 @@
SEARCH_RESULTS_FRAGMENT = """ SEARCH_RESULTS_FRAGMENT = """
fragment SearchFragment on SearchHome { fragment PropertyResult on SearchHome {
__typename __typename
pending_date pending_date
listing_id listing_id
@@ -371,7 +371,7 @@ _SEARCH_HOMES_DATA_BASE = """{
HOME_FRAGMENT = """ HOME_FRAGMENT = """
fragment SearchFragment on Home { fragment PropertyResult on Home {
__typename __typename
pending_date pending_date
listing_id listing_id
@@ -689,12 +689,8 @@ GENERAL_RESULTS_QUERY = """{
__typename __typename
count count
total total
results { results %s
__typename }""" % SEARCH_HOMES_DATA
...SearchFragment
...ListingPhotosFragment
}
}"""
LISTING_PHOTOS_FRAGMENT = """ LISTING_PHOTOS_FRAGMENT = """
fragment ListingPhotosFragment on SearchHome { fragment ListingPhotosFragment on SearchHome {
@@ -721,6 +717,7 @@ SEARCH_SUGGESTIONS_QUERY = """query Search_suggestions($searchInput: SearchSugge
geo { geo {
_id _id
_score _score
mpr_id
area_type area_type
city city
state_code state_code
@@ -764,6 +761,7 @@ SEARCH_SUGGESTIONS_QUERY = """query Search_suggestions($searchInput: SearchSugge
geo { geo {
_id _id
_score _score
mpr_id
area_type area_type
city city
state_code state_code

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.8.15" version = "0.8.18"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"