Compare commits

...

6 Commits

Author SHA1 Message Date
Zachary Hampton
011680f7d8 - style error bug fix 2024-06-06 15:24:12 -07:00
Zachary Hampton
93e6778a48 - exclude_pending parameter 2024-05-31 22:17:29 -07:00
Zachary Hampton
ec036bb989 - optimizations & updated realtor headers 2024-05-20 12:13:30 -07:00
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
8 changed files with 81 additions and 39 deletions

View File

@@ -21,7 +21,7 @@
```bash
pip install -U homeharvest
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
_Python version >= [3.9](https://www.python.org/downloads/release/python-3100/) required_
## Usage
@@ -90,9 +90,11 @@ Optional
├── foreclosure (True/False): If set, fetches only foreclosures
── proxy (string): In format 'http://user:pass@host:port'
── proxy (string): In format 'http://user:pass@host:port'
── extra_property_data (bool): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
```
### Property Schema
@@ -142,6 +144,11 @@ Property
│ ├── agent
│ ├── agent_email
│ └── agent_phone
├── Broker Info:
│ ├── broker
│ ├── broker_email
│ └── broker_website
```
### Exceptions

View File

@@ -17,11 +17,12 @@ def scrape_property(
date_to: str = None,
foreclosure: bool = None,
extra_property_data: bool = True,
exclude_pending: bool = False,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping
@@ -29,6 +30,7 @@ def scrape_property(
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
"""
validate_input(listing_type)
validate_dates(date_from, date_to)
@@ -44,17 +46,16 @@ def scrape_property(
date_to=date_to,
foreclosure=foreclosure,
extra_property_data=extra_property_data,
exclude_pending=exclude_pending,
)
site = RealtorScraper(scraper_input)
results = site.search()
properties_dfs = [process_result(result) for result in results]
properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs:
return pd.DataFrame()
properties_dfs = [df for df in properties_dfs if not df.empty]
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
@@ -5,6 +6,7 @@ from urllib3.util.retry import Retry
import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName
import json
@dataclass
@@ -19,6 +21,7 @@ class ScraperInput:
date_to: str | None = None
foreclosure: bool | None = False
extra_property_data: bool | None = True
exclude_pending: bool | None = False
class Scraper:
@@ -60,6 +63,7 @@ class Scraper:
self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending
def search(self) -> list[Property]: ...
@@ -70,18 +74,25 @@ class Scraper:
@staticmethod
def get_access_token():
url = "https://graph.realtor.com/auth/token"
device_id = str(uuid.uuid4()).upper()
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
headers = {
"Host": "graph.realtor.com",
"x-client-version": "24.20.4.149916",
"accept": "*/*",
"content-type": "Application/json",
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
"accept-language": "en-US,en;q=0.9",
}
response = requests.post(url, headers=headers, data=payload)
response = requests.post(
"https://graph.realtor.com/auth/token",
headers={
'Host': 'graph.realtor.com',
'Accept': '*/*',
'Content-Type': 'Application/json',
'X-Client-ID': 'rdc_mobile_native,iphone',
'X-Visitor-ID': device_id,
'X-Client-Version': '24.21.23.679885',
'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
},
data=json.dumps({
"grant_type": "device_mobile",
"device_id": device_id,
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
}))
data = response.json()

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Optional

View File

@@ -4,7 +4,7 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com
"""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Dict, Union, Optional
@@ -166,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"),
description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"),
@@ -525,7 +525,7 @@ class RealtorScraper(Scraper):
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING:
if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
return
property_id = result["property_id"]
@@ -805,7 +805,7 @@ class RealtorScraper(Scraper):
return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd
from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent
@@ -82,19 +83,20 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half
prop_data["sqft"] = description.sqft
prop_data["lot_sqft"] = description.lot_sqft
prop_data["sold_price"] = description.sold_price
prop_data["year_built"] = description.year_built
prop_data["parking_garage"] = description.garage
prop_data["stories"] = description.stories
prop_data["text"] = description.text
if description:
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if isinstance(description.style, str) else description.style.value if description.style else None
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half
prop_data["sqft"] = description.sqft
prop_data["lot_sqft"] = description.lot_sqft
prop_data["sold_price"] = description.sold_price
prop_data["year_built"] = description.year_built
prop_data["parking_garage"] = description.garage
prop_data["stories"] = description.stories
prop_data["text"] = description.text
properties_df = pd.DataFrame([prop_data])
properties_df = properties_df.reindex(columns=ordered_properties)

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.24"
version = "0.3.30"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
python = ">=3.9,<3.13"
requests = "^2.31.0"
pandas = "^2.1.1"

View File

@@ -4,7 +4,7 @@ from homeharvest import scrape_property
def test_realtor_pending_or_contingent():
pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale")
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale", exclude_pending=True)
assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
assert len(pending_or_contingent_result) != len(regular_result)
@@ -155,7 +155,7 @@ def test_realtor_without_extra_details():
),
]
assert results[0] != results[1]
assert not results[0].equals(results[1])
def test_pr_zip_code():
@@ -165,3 +165,23 @@ def test_pr_zip_code():
)
assert results is not None and len(results) > 0
def test_exclude_pending():
results = scrape_property(
location="33567",
listing_type="pending",
exclude_pending=True,
)
assert results is not None and len(results) > 0
def test_style_value_error():
results = scrape_property(
location="Alaska, AK",
listing_type="sold",
extra_property_data=False,
)
assert results is not None and len(results) > 0