mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 20:14:30 -08:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6d14b8df5a | ||
|
|
3f44744d61 |
@@ -94,7 +94,9 @@ Optional
|
|||||||
│
|
│
|
||||||
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
||||||
│
|
│
|
||||||
└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
|
├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
|
||||||
|
│
|
||||||
|
└── limit (integer): Maximum number of properties to fetch. Must be between 1 and 10,000; the default is 10,000.
|
||||||
```
|
```
|
||||||
|
|
||||||
### Property Schema
|
### Property Schema
|
||||||
@@ -126,6 +128,8 @@ Property
|
|||||||
├── Property Listing Details:
|
├── Property Listing Details:
|
||||||
│ ├── days_on_mls
|
│ ├── days_on_mls
|
||||||
│ ├── list_price
|
│ ├── list_price
|
||||||
|
│ ├── list_price_min
|
||||||
|
│ ├── list_price_max
|
||||||
│ ├── list_date
|
│ ├── list_date
|
||||||
│ ├── pending_date
|
│ ├── pending_date
|
||||||
│ ├── sold_price
|
│ ├── sold_price
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import warnings
|
import warnings
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from .core.scrapers import ScraperInput
|
from .core.scrapers import ScraperInput
|
||||||
from .utils import process_result, ordered_properties, validate_input, validate_dates
|
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
|
||||||
from .core.scrapers.realtor import RealtorScraper
|
from .core.scrapers.realtor import RealtorScraper
|
||||||
from .core.scrapers.models import ListingType
|
from .core.scrapers.models import ListingType
|
||||||
|
|
||||||
@@ -18,6 +18,7 @@ def scrape_property(
|
|||||||
foreclosure: bool = None,
|
foreclosure: bool = None,
|
||||||
extra_property_data: bool = True,
|
extra_property_data: bool = True,
|
||||||
exclude_pending: bool = False,
|
exclude_pending: bool = False,
|
||||||
|
limit: int = 10000,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Scrape properties from Realtor.com based on a given location and listing type.
|
Scrape properties from Realtor.com based on a given location and listing type.
|
||||||
@@ -31,9 +32,11 @@ def scrape_property(
|
|||||||
:param foreclosure: If set, fetches only foreclosure listings.
|
:param foreclosure: If set, fetches only foreclosure listings.
|
||||||
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
|
||||||
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
|
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
|
||||||
|
:param limit: Limit the number of results returned. Maximum is 10,000.
|
||||||
"""
|
"""
|
||||||
validate_input(listing_type)
|
validate_input(listing_type)
|
||||||
validate_dates(date_from, date_to)
|
validate_dates(date_from, date_to)
|
||||||
|
validate_limit(limit)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
location=location,
|
location=location,
|
||||||
@@ -47,6 +50,7 @@ def scrape_property(
|
|||||||
foreclosure=foreclosure,
|
foreclosure=foreclosure,
|
||||||
extra_property_data=extra_property_data,
|
extra_property_data=extra_property_data,
|
||||||
exclude_pending=exclude_pending,
|
exclude_pending=exclude_pending,
|
||||||
|
limit=limit,
|
||||||
)
|
)
|
||||||
|
|
||||||
site = RealtorScraper(scraper_input)
|
site = RealtorScraper(scraper_input)
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ class ScraperInput:
|
|||||||
foreclosure: bool | None = False
|
foreclosure: bool | None = False
|
||||||
extra_property_data: bool | None = True
|
extra_property_data: bool | None = True
|
||||||
exclude_pending: bool | None = False
|
exclude_pending: bool | None = False
|
||||||
|
limit: int = 10000
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
@@ -64,6 +65,7 @@ class Scraper:
|
|||||||
self.foreclosure = scraper_input.foreclosure
|
self.foreclosure = scraper_input.foreclosure
|
||||||
self.extra_property_data = scraper_input.extra_property_data
|
self.extra_property_data = scraper_input.extra_property_data
|
||||||
self.exclude_pending = scraper_input.exclude_pending
|
self.exclude_pending = scraper_input.exclude_pending
|
||||||
|
self.limit = scraper_input.limit
|
||||||
|
|
||||||
def search(self) -> list[Property]: ...
|
def search(self) -> list[Property]: ...
|
||||||
|
|
||||||
|
|||||||
@@ -113,6 +113,9 @@ class Property:
|
|||||||
address: Address | None = None
|
address: Address | None = None
|
||||||
|
|
||||||
list_price: int | None = None
|
list_price: int | None = None
|
||||||
|
list_price_min: int | None = None
|
||||||
|
list_price_max: int | None = None
|
||||||
|
|
||||||
list_date: str | None = None
|
list_date: str | None = None
|
||||||
pending_date: str | None = None
|
pending_date: str | None = None
|
||||||
last_sold_date: str | None = None
|
last_sold_date: str | None = None
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ class RealtorScraper(Scraper):
|
|||||||
PROPERTY_GQL = "https://graph.realtor.com/graphql"
|
PROPERTY_GQL = "https://graph.realtor.com/graphql"
|
||||||
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
|
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
|
||||||
NUM_PROPERTY_WORKERS = 20
|
NUM_PROPERTY_WORKERS = 20
|
||||||
|
DEFAULT_PAGE_SIZE = 200
|
||||||
|
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
@@ -76,7 +77,6 @@ class RealtorScraper(Scraper):
|
|||||||
baths_half
|
baths_half
|
||||||
lot_sqft
|
lot_sqft
|
||||||
sold_price
|
sold_price
|
||||||
sold_price
|
|
||||||
type
|
type
|
||||||
price
|
price
|
||||||
status
|
status
|
||||||
@@ -115,10 +115,10 @@ class RealtorScraper(Scraper):
|
|||||||
)
|
)
|
||||||
|
|
||||||
able_to_get_lat_long = (
|
able_to_get_lat_long = (
|
||||||
property_info
|
property_info
|
||||||
and property_info.get("address")
|
and property_info.get("address")
|
||||||
and property_info["address"].get("location")
|
and property_info["address"].get("location")
|
||||||
and property_info["address"]["location"].get("coordinate")
|
and property_info["address"]["location"].get("coordinate")
|
||||||
)
|
)
|
||||||
list_date_str = (
|
list_date_str = (
|
||||||
property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None
|
property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None
|
||||||
@@ -326,6 +326,8 @@ class RealtorScraper(Scraper):
|
|||||||
last_sold_price
|
last_sold_price
|
||||||
last_sold_date
|
last_sold_date
|
||||||
list_price
|
list_price
|
||||||
|
list_price_max
|
||||||
|
list_price_min
|
||||||
price_per_sqft
|
price_per_sqft
|
||||||
flags {
|
flags {
|
||||||
is_contingent
|
is_contingent
|
||||||
@@ -481,7 +483,7 @@ class RealtorScraper(Scraper):
|
|||||||
)
|
)
|
||||||
else: #: general search, came from an address
|
else: #: general search, came from an address
|
||||||
query = (
|
query = (
|
||||||
"""query Property_search(
|
"""query Property_search(
|
||||||
$property_id: [ID]!
|
$property_id: [ID]!
|
||||||
$offset: Int!,
|
$offset: Int!,
|
||||||
) {
|
) {
|
||||||
@@ -492,7 +494,7 @@ class RealtorScraper(Scraper):
|
|||||||
limit: 1
|
limit: 1
|
||||||
offset: $offset
|
offset: $offset
|
||||||
) %s"""
|
) %s"""
|
||||||
% results_query
|
% results_query
|
||||||
)
|
)
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
@@ -507,12 +509,12 @@ class RealtorScraper(Scraper):
|
|||||||
properties: list[Property] = []
|
properties: list[Property] = []
|
||||||
|
|
||||||
if (
|
if (
|
||||||
response_json is None
|
response_json is None
|
||||||
or "data" not in response_json
|
or "data" not in response_json
|
||||||
or response_json["data"] is None
|
or response_json["data"] is None
|
||||||
or search_key not in response_json["data"]
|
or search_key not in response_json["data"]
|
||||||
or response_json["data"][search_key] is None
|
or response_json["data"][search_key] is None
|
||||||
or "results" not in response_json["data"][search_key]
|
or "results" not in response_json["data"][search_key]
|
||||||
):
|
):
|
||||||
return {"total": 0, "properties": []}
|
return {"total": 0, "properties": []}
|
||||||
|
|
||||||
@@ -523,10 +525,10 @@ class RealtorScraper(Scraper):
|
|||||||
return
|
return
|
||||||
|
|
||||||
able_to_get_lat_long = (
|
able_to_get_lat_long = (
|
||||||
result
|
result
|
||||||
and result.get("location")
|
and result.get("location")
|
||||||
and result["location"].get("address")
|
and result["location"].get("address")
|
||||||
and result["location"]["address"].get("coordinate")
|
and result["location"]["address"].get("coordinate")
|
||||||
)
|
)
|
||||||
|
|
||||||
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
|
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
|
||||||
@@ -551,6 +553,8 @@ class RealtorScraper(Scraper):
|
|||||||
),
|
),
|
||||||
status="PENDING" if is_pending else result["status"].upper(),
|
status="PENDING" if is_pending else result["status"].upper(),
|
||||||
list_price=result["list_price"],
|
list_price=result["list_price"],
|
||||||
|
list_price_min=result["list_price_min"],
|
||||||
|
list_price_max=result["list_price_max"],
|
||||||
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
|
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
|
||||||
prc_sqft=result.get("price_per_sqft"),
|
prc_sqft=result.get("price_per_sqft"),
|
||||||
last_sold_date=result.get("last_sold_date"),
|
last_sold_date=result.get("last_sold_date"),
|
||||||
@@ -571,9 +575,17 @@ class RealtorScraper(Scraper):
|
|||||||
)
|
)
|
||||||
return realty_property
|
return realty_property
|
||||||
|
|
||||||
|
properties_list = response_json["data"][search_key]["results"]
|
||||||
|
total_properties = response_json["data"][search_key]["total"]
|
||||||
|
offset = variables.get("offset", 0)
|
||||||
|
|
||||||
|
#: limit the number of properties to be processed
|
||||||
|
#: example, if your offset is 200, and your limit is 250, return 50
|
||||||
|
properties_list = properties_list[:self.limit - offset]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
|
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
|
||||||
futures = [
|
futures = [
|
||||||
executor.submit(process_property, result) for result in response_json["data"][search_key]["results"]
|
executor.submit(process_property, result) for result in properties_list
|
||||||
]
|
]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
@@ -582,7 +594,7 @@ class RealtorScraper(Scraper):
|
|||||||
properties.append(result)
|
properties.append(result)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"total": response_json["data"][search_key]["total"],
|
"total": total_properties,
|
||||||
"properties": properties,
|
"properties": properties,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -654,7 +666,7 @@ class RealtorScraper(Scraper):
|
|||||||
variables=search_variables | {"offset": i},
|
variables=search_variables | {"offset": i},
|
||||||
search_type=search_type,
|
search_type=search_type,
|
||||||
)
|
)
|
||||||
for i in range(200, min(total, 10000), 200)
|
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
|
||||||
]
|
]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
@@ -790,7 +802,10 @@ class RealtorScraper(Scraper):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_description(result: dict) -> Description:
|
def _parse_description(result: dict) -> Description | None:
|
||||||
|
if not result:
|
||||||
|
return None
|
||||||
|
|
||||||
description_data = result.get("description", {})
|
description_data = result.get("description", {})
|
||||||
|
|
||||||
if description_data is None or not isinstance(description_data, dict):
|
if description_data is None or not isinstance(description_data, dict):
|
||||||
@@ -801,11 +816,8 @@ class RealtorScraper(Scraper):
|
|||||||
style = style.upper()
|
style = style.upper()
|
||||||
|
|
||||||
primary_photo = ""
|
primary_photo = ""
|
||||||
if result and "primary_photo" in result:
|
if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")):
|
||||||
primary_photo_info = result["primary_photo"]
|
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
|
||||||
if primary_photo_info and "href" in primary_photo_info:
|
|
||||||
primary_photo_href = primary_photo_info["href"]
|
|
||||||
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
|
|
||||||
|
|
||||||
return Description(
|
return Description(
|
||||||
primary_photo=primary_photo,
|
primary_photo=primary_photo,
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ ordered_properties = [
|
|||||||
"year_built",
|
"year_built",
|
||||||
"days_on_mls",
|
"days_on_mls",
|
||||||
"list_price",
|
"list_price",
|
||||||
|
"list_price_min",
|
||||||
|
"list_price_max",
|
||||||
"list_date",
|
"list_date",
|
||||||
"sold_price",
|
"sold_price",
|
||||||
"last_sold_date",
|
"last_sold_date",
|
||||||
@@ -86,7 +88,8 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
if description:
|
if description:
|
||||||
prop_data["primary_photo"] = description.primary_photo
|
prop_data["primary_photo"] = description.primary_photo
|
||||||
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
|
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
|
||||||
prop_data["style"] = description.style if isinstance(description.style, str) else description.style.value if description.style else None
|
prop_data["style"] = description.style if isinstance(description.style,
|
||||||
|
str) else description.style.value if description.style else None
|
||||||
prop_data["beds"] = description.beds
|
prop_data["beds"] = description.beds
|
||||||
prop_data["full_baths"] = description.baths_full
|
prop_data["full_baths"] = description.baths_full
|
||||||
prop_data["half_baths"] = description.baths_half
|
prop_data["half_baths"] = description.baths_half
|
||||||
@@ -110,7 +113,7 @@ def validate_input(listing_type: str) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
||||||
if (date_from is not None and date_to is None) or (date_from is None and date_to is not None):
|
if isinstance(date_from, str) != isinstance(date_to, str):
|
||||||
raise InvalidDate("Both date_from and date_to must be provided.")
|
raise InvalidDate("Both date_from and date_to must be provided.")
|
||||||
|
|
||||||
if date_from and date_to:
|
if date_from and date_to:
|
||||||
@@ -122,3 +125,10 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
|
|||||||
raise InvalidDate("date_to must be after date_from.")
|
raise InvalidDate("date_to must be after date_from.")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise InvalidDate(f"Invalid date format or range")
|
raise InvalidDate(f"Invalid date format or range")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_limit(limit: int) -> None:
|
||||||
|
#: 1 -> 10000 limit
|
||||||
|
|
||||||
|
if limit is not None and (limit < 1 or limit > 10000):
|
||||||
|
raise ValueError("Property limit must be between 1 and 10,000.")
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.3.32"
|
version = "0.3.34"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/Bunsly/HomeHarvest"
|
||||||
|
|||||||
@@ -105,8 +105,8 @@ def test_realtor():
|
|||||||
location="2530 Al Lipscomb Way",
|
location="2530 Al Lipscomb Way",
|
||||||
listing_type="for_sale",
|
listing_type="for_sale",
|
||||||
),
|
),
|
||||||
scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format
|
scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format
|
scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="85281"),
|
scrape_property(location="85281"),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -117,6 +117,7 @@ def test_realtor_city():
|
|||||||
results = scrape_property(
|
results = scrape_property(
|
||||||
location="Atlanta, GA",
|
location="Atlanta, GA",
|
||||||
listing_type="for_sale",
|
listing_type="for_sale",
|
||||||
|
limit=1000
|
||||||
)
|
)
|
||||||
|
|
||||||
assert results is not None and len(results) > 0
|
assert results is not None and len(results) > 0
|
||||||
@@ -140,7 +141,7 @@ def test_realtor_foreclosed():
|
|||||||
|
|
||||||
|
|
||||||
def test_realtor_agent():
|
def test_realtor_agent():
|
||||||
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale")
|
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000)
|
||||||
assert scraped["agent"].nunique() > 1
|
assert scraped["agent"].nunique() > 1
|
||||||
|
|
||||||
|
|
||||||
@@ -182,6 +183,58 @@ def test_style_value_error():
|
|||||||
location="Alaska, AK",
|
location="Alaska, AK",
|
||||||
listing_type="sold",
|
listing_type="sold",
|
||||||
extra_property_data=False,
|
extra_property_data=False,
|
||||||
|
limit=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert results is not None and len(results) > 0
|
assert results is not None and len(results) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_primary_image_error():
    """
    Scrape recent rentals around a location and require a non-empty result.

    Presumably a regression test for a failure while parsing the primary photo
    (going by the test name) -- TODO confirm against the original issue.
    """
    #: NOTE(review): "Spokane, PA" -- another test in this file uses "Spokane, WA";
    #: confirm the state here is intentional before changing it
    search_kwargs = {
        "location": "Spokane, PA",
        "listing_type": "for_rent",  # or (for_sale, for_rent, pending)
        "past_days": 360,
        "radius": 3,
        "extra_property_data": False,
    }

    results = scrape_property(**search_kwargs)

    assert results is not None
    assert len(results) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_limit():
    """Check that the ``limit`` argument caps the number of rows scrape_property returns."""
    search = {"location": "Waddell, AZ", "listing_type": "for_sale"}

    #: the result must never contain more rows than the requested limit
    over_limit = 876
    over_results = scrape_property(**search, limit=over_limit)

    assert over_results is not None
    assert len(over_results) <= over_limit

    #: a limit of one must yield exactly one row
    under_limit = 1
    under_results = scrape_property(**search, limit=under_limit)

    assert under_results is not None
    assert len(under_results) == under_limit
|
||||||
|
|
||||||
|
|
||||||
|
def test_apartment_list_price():
    """
    Check that most apartment rentals carry some list price information.

    More than half of the rows whose style is "APARTMENT" must have at least one
    of list_price, list_price_min or list_price_max populated.
    """
    results = scrape_property(
        location="Spokane, WA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        extra_property_data=False,
    )

    assert results is not None

    apartments = results[results["style"] == "APARTMENT"]

    #: fail with a clear assertion rather than a ZeroDivisionError in the ratio
    #: below when the search returns no apartments at all
    assert len(apartments) > 0

    #: get percentage of results with at least 1 of any column not none, list_price, list_price_min, list_price_max
    price_columns = ["list_price", "list_price_min", "list_price_max"]
    has_any_price = apartments[price_columns].notnull().any(axis=1)

    assert len(apartments[has_any_price]) / len(apartments) > 0.5
|
||||||
|
|||||||
Reference in New Issue
Block a user