Compare commits

...

7 Commits

Author SHA1 Message Date
Zachary Hampton
93e6778a48 - exclude_pending parameter 2024-05-31 22:17:29 -07:00
Zachary Hampton
ec036bb989 - optimizations & updated realtor headers 2024-05-20 12:13:30 -07:00
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
8 changed files with 79 additions and 27 deletions

View File

@@ -21,7 +21,7 @@
```bash ```bash
pip install -U homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage ## Usage
@@ -90,9 +90,11 @@ Optional
├── foreclosure (True/False): If set, fetches only foreclosures ├── foreclosure (True/False): If set, fetches only foreclosures
├── proxy (string): In format 'http://user:pass@host:port' ├── proxy (string): In format 'http://user:pass@host:port'
├── extra_property_data (bool): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
``` ```
### Property Schema ### Property Schema
@@ -142,6 +144,11 @@ Property
│ ├── agent │ ├── agent
│ ├── agent_email │ ├── agent_email
│ └── agent_phone │ └── agent_phone
├── Broker Info:
│ ├── broker
│ ├── broker_email
│ └── broker_website
``` ```
### Exceptions ### Exceptions

View File

@@ -17,11 +17,12 @@ def scrape_property(
date_to: str = None, date_to: str = None,
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold) :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping :param proxy: Proxy to use for scraping
@@ -29,6 +30,7 @@ def scrape_property(
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28 :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param foreclosure: If set, fetches only foreclosure listings. :param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
@@ -44,16 +46,17 @@ def scrape_property(
date_to=date_to, date_to=date_to,
foreclosure=foreclosure, foreclosure=foreclosure,
extra_property_data=extra_property_data, extra_property_data=extra_property_data,
exclude_pending=exclude_pending,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
properties_dfs = [process_result(result) for result in results] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""}) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@@ -5,6 +6,7 @@ from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
import json
@dataclass @dataclass
@@ -19,6 +21,7 @@ class ScraperInput:
date_to: str | None = None date_to: str | None = None
foreclosure: bool | None = False foreclosure: bool | None = False
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False
class Scraper: class Scraper:
@@ -60,6 +63,7 @@ class Scraper:
self.date_to = scraper_input.date_to self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending
def search(self) -> list[Property]: ... def search(self) -> list[Property]: ...
@@ -70,18 +74,25 @@ class Scraper:
@staticmethod @staticmethod
def get_access_token(): def get_access_token():
url = "https://graph.realtor.com/auth/token" device_id = str(uuid.uuid4()).upper()
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}' response = requests.post(
headers = { "https://graph.realtor.com/auth/token",
"Host": "graph.realtor.com", headers={
"x-client-version": "24.20.4.149916", 'Host': 'graph.realtor.com',
"accept": "*/*", 'Accept': '*/*',
"content-type": "Application/json", 'Content-Type': 'Application/json',
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0", 'X-Client-ID': 'rdc_mobile_native,iphone',
"accept-language": "en-US,en;q=0.9", 'X-Visitor-ID': device_id,
} 'X-Client-Version': '24.21.23.679885',
response = requests.post(url, headers=headers, data=payload) 'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
},
data=json.dumps({
"grant_type": "device_mobile",
"device_id": device_id,
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
}))
data = response.json() data = response.json()

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
@@ -36,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME" CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP" CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO" CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS" CONDOS = "CONDOS"
COOP = "COOP" COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX" DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +54,7 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
full_line: str | None = None
street: str | None = None street: str | None = None
unit: str | None = None unit: str | None = None
city: str | None = None city: str | None = None

View File

@@ -4,7 +4,7 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None, longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])), alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -236,6 +237,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -352,6 +354,7 @@ class RealtorScraper(Scraper):
street_number street_number
street_name street_name
street_suffix street_suffix
line
unit unit
city city
state_code state_code
@@ -522,7 +525,7 @@ class RealtorScraper(Scraper):
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING: if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
return return
property_id = result["property_id"] property_id = result["property_id"]
@@ -657,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data: if not self.extra_property_data:
return {} return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) { query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) { home(property_id: $property_id) {
__typename __typename
@@ -765,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"] address = result["address"]
return Address( return Address(
full_line=address.get("line"),
street=" ".join( street=" ".join(
part for part in [ part for part in [
address.get("street_number"), address.get("street_number"),
@@ -799,8 +805,8 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType(style) if style else None, style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent from .core.scrapers.models import Property, ListingType, Agent
@@ -10,6 +11,7 @@ ordered_properties = [
"status", "status",
"text", "text",
"style", "style",
"full_street_line",
"street", "street",
"unit", "unit",
"city", "city",
@@ -55,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
@@ -81,7 +84,7 @@ def process_result(result: Property) -> pd.DataFrame:
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if type(description.style) == str else description.style.value prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.22" version = "0.3.29"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main" homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.10,<3.13" python = ">=3.9,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"

View File

@@ -4,7 +4,7 @@ from homeharvest import scrape_property
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending") pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale") regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale", exclude_pending=True)
assert all([result is not None for result in [pending_or_contingent_result, regular_result]]) assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
assert len(pending_or_contingent_result) != len(regular_result) assert len(pending_or_contingent_result) != len(regular_result)
@@ -155,4 +155,23 @@ def test_realtor_without_extra_details():
), ),
] ]
assert results[0] != results[1] assert not results[0].equals(results[1])
def test_pr_zip_code():
results = scrape_property(
location="00741",
listing_type="for_sale",
)
assert results is not None and len(results) > 0
def test_exclude_pending():
results = scrape_property(
location="33567",
listing_type="pending",
exclude_pending=True,
)
assert results is not None and len(results) > 0