2023-10-04 08:11:53 -07:00
|
|
|
import warnings
|
2023-09-18 14:18:22 -07:00
|
|
|
import pandas as pd
|
|
|
|
from .core.scrapers import ScraperInput
|
2024-07-15 07:19:57 -07:00
|
|
|
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
|
2023-09-15 20:58:54 -07:00
|
|
|
from .core.scrapers.realtor import RealtorScraper
|
2024-11-03 15:23:07 -08:00
|
|
|
from .core.scrapers.models import ListingType, SearchPropertyType
|
2023-09-15 15:17:37 -07:00
|
|
|
|
2023-09-18 20:28:03 -07:00
|
|
|
|
2023-10-04 08:11:53 -07:00
|
|
|
def scrape_property(
    location: str,
    listing_type: str = "for_sale",
    property_type: list[str] | None = None,
    radius: float | None = None,
    mls_only: bool = False,
    past_days: int | None = None,
    proxy: str | None = None,
    date_from: str | None = None,  # TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation
    date_to: str | None = None,
    foreclosure: bool | None = None,
    extra_property_data: bool = True,
    exclude_pending: bool = False,
    limit: int = 10000,
) -> pd.DataFrame:
    """
    Scrape properties from Realtor.com based on a given location and listing type.

    :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
    :param listing_type: Listing Type (for_sale, for_rent, sold, pending). Matched
        case-insensitively against the ``ListingType`` member names.
    :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile).
        Each entry is upper-cased and looked up by name in ``SearchPropertyType``;
        ``None`` or an empty list means no property-type filter is passed on.
    :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
    :param mls_only: If set, fetches only listings with MLS IDs.
    :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
    :param proxy: Proxy to use for scraping
    :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
    :param foreclosure: If set, fetches only foreclosure listings.
    :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
    :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
    :param limit: Limit the number of results returned. Maximum is 10,000.
    :return: A DataFrame with one row per property, restricted to and ordered by
        ``ordered_properties``, with ``"None"``, ``None`` and ``""`` replaced by
        ``pd.NA``. An empty DataFrame (no columns) is returned when the search
        yields no non-empty results.
    :raises KeyError: If a ``property_type`` entry is not a ``SearchPropertyType``
        member name.
    :raises Exception: Whatever ``validate_input``, ``validate_dates`` or
        ``validate_limit`` raise for invalid arguments (defined in ``.utils``;
        exact exception types are not visible from this module).
    """
    # Fail fast on bad arguments before any scraper object is built.
    validate_input(listing_type)
    validate_dates(date_from, date_to)
    validate_limit(limit)

    scraper_input = ScraperInput(
        location=location,
        listing_type=ListingType[listing_type.upper()],
        property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
        proxy=proxy,
        radius=radius,
        mls_only=mls_only,
        last_x_days=past_days,  # public name is `past_days`; ScraperInput calls it `last_x_days`
        date_from=date_from,
        date_to=date_to,
        foreclosure=foreclosure,
        extra_property_data=extra_property_data,
        exclude_pending=exclude_pending,
        limit=limit,
    )

    site = RealtorScraper(scraper_input)
    results = site.search()

    # Convert each result to a frame and drop the empty ones so they do not
    # take part in the concatenation below.
    properties_dfs = [df for result in results if not (df := process_result(result)).empty]
    if not properties_dfs:
        return pd.DataFrame()

    # Both the concat and the replace run inside the filter so that any pandas
    # FutureWarning they emit is silenced for callers.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace(
            {"None": pd.NA, None: pd.NA, "": pd.NA}
        )
|