mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 03:54:29 -08:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
93e6778a48 | ||
|
|
ec036bb989 | ||
|
|
aacd168545 | ||
|
|
0d70007000 | ||
|
|
018d3fbac4 | ||
|
|
803fd618e9 |
13
README.md
13
README.md
@@ -21,7 +21,7 @@
|
|||||||
```bash
|
```bash
|
||||||
pip install -U homeharvest
|
pip install -U homeharvest
|
||||||
```
|
```
|
||||||
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
_Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@@ -90,9 +90,11 @@ Optional
|
|||||||
│
|
│
|
||||||
├── foreclosure (True/False): If set, fetches only foreclosures
|
├── foreclosure (True/False): If set, fetches only foreclosures
|
||||||
│
|
│
|
||||||
└── proxy (string): In format 'http://user:pass@host:port'
|
├── proxy (string): In format 'http://user:pass@host:port'
|
||||||
│
|
│
|
||||||
└── extra_property_data (bool): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
||||||
|
│
|
||||||
|
└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
|
||||||
```
|
```
|
||||||
|
|
||||||
### Property Schema
|
### Property Schema
|
||||||
@@ -142,6 +144,11 @@ Property
|
|||||||
│ ├── agent
|
│ ├── agent
|
||||||
│ ├── agent_email
|
│ ├── agent_email
|
||||||
│ └── agent_phone
|
│ └── agent_phone
|
||||||
|
|
||||||
|
├── Broker Info:
|
||||||
|
│ ├── broker
|
||||||
|
│ ├── broker_email
|
||||||
|
│ └── broker_website
|
||||||
```
|
```
|
||||||
|
|
||||||
### Exceptions
|
### Exceptions
|
||||||
|
|||||||
@@ -17,11 +17,12 @@ def scrape_property(
|
|||||||
date_to: str = None,
|
date_to: str = None,
|
||||||
foreclosure: bool = None,
|
foreclosure: bool = None,
|
||||||
extra_property_data: bool = True,
|
extra_property_data: bool = True,
|
||||||
|
exclude_pending: bool = False,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Scrape properties from Realtor.com based on a given location and listing type.
|
Scrape properties from Realtor.com based on a given location and listing type.
|
||||||
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
|
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
|
||||||
:param listing_type: Listing Type (for_sale, for_rent, sold)
|
:param listing_type: Listing Type (for_sale, for_rent, sold, pending)
|
||||||
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
|
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
|
||||||
:param mls_only: If set, fetches only listings with MLS IDs.
|
:param mls_only: If set, fetches only listings with MLS IDs.
|
||||||
:param proxy: Proxy to use for scraping
|
:param proxy: Proxy to use for scraping
|
||||||
@@ -29,6 +30,7 @@ def scrape_property(
|
|||||||
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
|
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
|
||||||
:param foreclosure: If set, fetches only foreclosure listings.
|
:param foreclosure: If set, fetches only foreclosure listings.
|
||||||
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
||||||
|
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
|
||||||
"""
|
"""
|
||||||
validate_input(listing_type)
|
validate_input(listing_type)
|
||||||
validate_dates(date_from, date_to)
|
validate_dates(date_from, date_to)
|
||||||
@@ -44,16 +46,17 @@ def scrape_property(
|
|||||||
date_to=date_to,
|
date_to=date_to,
|
||||||
foreclosure=foreclosure,
|
foreclosure=foreclosure,
|
||||||
extra_property_data=extra_property_data,
|
extra_property_data=extra_property_data,
|
||||||
|
exclude_pending=exclude_pending,
|
||||||
)
|
)
|
||||||
|
|
||||||
site = RealtorScraper(scraper_input)
|
site = RealtorScraper(scraper_input)
|
||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
properties_dfs = [df for result in results if not (df := process_result(result)).empty]
|
||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore", category=FutureWarning)
|
warnings.simplefilter("ignore", category=FutureWarning)
|
||||||
|
|
||||||
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""})
|
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
@@ -5,6 +6,7 @@ from urllib3.util.retry import Retry
|
|||||||
import uuid
|
import uuid
|
||||||
from ...exceptions import AuthenticationError
|
from ...exceptions import AuthenticationError
|
||||||
from .models import Property, ListingType, SiteName
|
from .models import Property, ListingType, SiteName
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -19,6 +21,7 @@ class ScraperInput:
|
|||||||
date_to: str | None = None
|
date_to: str | None = None
|
||||||
foreclosure: bool | None = False
|
foreclosure: bool | None = False
|
||||||
extra_property_data: bool | None = True
|
extra_property_data: bool | None = True
|
||||||
|
exclude_pending: bool | None = False
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
@@ -60,6 +63,7 @@ class Scraper:
|
|||||||
self.date_to = scraper_input.date_to
|
self.date_to = scraper_input.date_to
|
||||||
self.foreclosure = scraper_input.foreclosure
|
self.foreclosure = scraper_input.foreclosure
|
||||||
self.extra_property_data = scraper_input.extra_property_data
|
self.extra_property_data = scraper_input.extra_property_data
|
||||||
|
self.exclude_pending = scraper_input.exclude_pending
|
||||||
|
|
||||||
def search(self) -> list[Property]: ...
|
def search(self) -> list[Property]: ...
|
||||||
|
|
||||||
@@ -70,18 +74,25 @@ class Scraper:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_access_token():
|
def get_access_token():
|
||||||
url = "https://graph.realtor.com/auth/token"
|
device_id = str(uuid.uuid4()).upper()
|
||||||
|
|
||||||
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
|
response = requests.post(
|
||||||
headers = {
|
"https://graph.realtor.com/auth/token",
|
||||||
"Host": "graph.realtor.com",
|
headers={
|
||||||
"x-client-version": "24.20.4.149916",
|
'Host': 'graph.realtor.com',
|
||||||
"accept": "*/*",
|
'Accept': '*/*',
|
||||||
"content-type": "Application/json",
|
'Content-Type': 'Application/json',
|
||||||
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
|
'X-Client-ID': 'rdc_mobile_native,iphone',
|
||||||
"accept-language": "en-US,en;q=0.9",
|
'X-Visitor-ID': device_id,
|
||||||
}
|
'X-Client-Version': '24.21.23.679885',
|
||||||
response = requests.post(url, headers=headers, data=payload)
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
|
||||||
|
},
|
||||||
|
data=json.dumps({
|
||||||
|
"grant_type": "device_mobile",
|
||||||
|
"device_id": device_id,
|
||||||
|
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
|
||||||
|
}))
|
||||||
|
|
||||||
data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -36,6 +37,7 @@ class PropertyType(Enum):
|
|||||||
CONDO_TOWNHOME = "CONDO_TOWNHOME"
|
CONDO_TOWNHOME = "CONDO_TOWNHOME"
|
||||||
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
|
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
|
||||||
CONDO = "CONDO"
|
CONDO = "CONDO"
|
||||||
|
CONDOP = "CONDOP"
|
||||||
CONDOS = "CONDOS"
|
CONDOS = "CONDOS"
|
||||||
COOP = "COOP"
|
COOP = "COOP"
|
||||||
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
|
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ homeharvest.realtor.__init__
|
|||||||
|
|
||||||
This module implements the scraper for realtor.com
|
This module implements the scraper for realtor.com
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, Union, Optional
|
from typing import Dict, Union, Optional
|
||||||
@@ -166,7 +166,7 @@ class RealtorScraper(Scraper):
|
|||||||
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
||||||
address=self._parse_address(property_info, search_type="handle_listing"),
|
address=self._parse_address(property_info, search_type="handle_listing"),
|
||||||
description=Description(
|
description=Description(
|
||||||
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
|
alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
|
||||||
style=property_info["basic"].get("type", "").upper(),
|
style=property_info["basic"].get("type", "").upper(),
|
||||||
beds=property_info["basic"].get("beds"),
|
beds=property_info["basic"].get("beds"),
|
||||||
baths_full=property_info["basic"].get("baths_full"),
|
baths_full=property_info["basic"].get("baths_full"),
|
||||||
@@ -525,7 +525,7 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
|
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
|
||||||
|
|
||||||
if is_pending and self.listing_type != ListingType.PENDING:
|
if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
|
||||||
return
|
return
|
||||||
|
|
||||||
property_id = result["property_id"]
|
property_id = result["property_id"]
|
||||||
@@ -805,8 +805,8 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
return Description(
|
return Description(
|
||||||
primary_photo=primary_photo,
|
primary_photo=primary_photo,
|
||||||
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
|
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
|
||||||
style=PropertyType(style) if style else None,
|
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
|
||||||
beds=description_data.get("beds"),
|
beds=description_data.get("beds"),
|
||||||
baths_full=description_data.get("baths_full"),
|
baths_full=description_data.get("baths_full"),
|
||||||
baths_half=description_data.get("baths_half"),
|
baths_half=description_data.get("baths_half"),
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from .core.scrapers.models import Property, ListingType, Agent
|
from .core.scrapers.models import Property, ListingType, Agent
|
||||||
@@ -83,7 +84,7 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
|
|
||||||
description = result.description
|
description = result.description
|
||||||
prop_data["primary_photo"] = description.primary_photo
|
prop_data["primary_photo"] = description.primary_photo
|
||||||
prop_data["alt_photos"] = ", ".join(description.alt_photos)
|
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
|
||||||
prop_data["style"] = description.style if type(description.style) == str else description.style.value
|
prop_data["style"] = description.style if type(description.style) == str else description.style.value
|
||||||
prop_data["beds"] = description.beds
|
prop_data["beds"] = description.beds
|
||||||
prop_data["full_baths"] = description.baths_full
|
prop_data["full_baths"] = description.baths_full
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.3.23"
|
version = "0.3.29"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/Bunsly/HomeHarvest"
|
||||||
@@ -10,7 +10,7 @@ readme = "README.md"
|
|||||||
homeharvest = "homeharvest.cli:main"
|
homeharvest = "homeharvest.cli:main"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = ">=3.10,<3.13"
|
python = ">=3.9,<3.13"
|
||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
pandas = "^2.1.1"
|
pandas = "^2.1.1"
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from homeharvest import scrape_property
|
|||||||
def test_realtor_pending_or_contingent():
|
def test_realtor_pending_or_contingent():
|
||||||
pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
|
pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
|
||||||
|
|
||||||
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale")
|
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale", exclude_pending=True)
|
||||||
|
|
||||||
assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
|
assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
|
||||||
assert len(pending_or_contingent_result) != len(regular_result)
|
assert len(pending_or_contingent_result) != len(regular_result)
|
||||||
@@ -155,7 +155,7 @@ def test_realtor_without_extra_details():
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert results[0] != results[1]
|
assert not results[0].equals(results[1])
|
||||||
|
|
||||||
|
|
||||||
def test_pr_zip_code():
|
def test_pr_zip_code():
|
||||||
@@ -165,3 +165,13 @@ def test_pr_zip_code():
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert results is not None and len(results) > 0
|
assert results is not None and len(results) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_exclude_pending():
|
||||||
|
results = scrape_property(
|
||||||
|
location="33567",
|
||||||
|
listing_type="pending",
|
||||||
|
exclude_pending=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert results is not None and len(results) > 0
|
||||||
|
|||||||
Reference in New Issue
Block a user