Compare commits

...

2 Commits

Author SHA1 Message Date
Zachary Hampton
6d14b8df5a - fix limit parameter
- fix specific for_rent apartment listing prices
2024-08-13 10:44:11 -07:00
Zachary Hampton
3f44744d61 - primary photo bug fix
- limit parameter
2024-07-15 07:19:57 -07:00
8 changed files with 123 additions and 35 deletions

View File

@@ -94,7 +94,9 @@ Optional
├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' ├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
└── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
``` ```
### Property Schema ### Property Schema
@@ -126,6 +128,8 @@ Property
├── Property Listing Details: ├── Property Listing Details:
│ ├── days_on_mls │ ├── days_on_mls
│ ├── list_price │ ├── list_price
│ ├── list_price_min
│ ├── list_price_max
│ ├── list_date │ ├── list_date
│ ├── pending_date │ ├── pending_date
│ ├── sold_price │ ├── sold_price

View File

@@ -1,7 +1,7 @@
import warnings import warnings
import pandas as pd import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType from .core.scrapers.models import ListingType
@@ -18,6 +18,7 @@ def scrape_property(
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -31,9 +32,11 @@ def scrape_property(
:param foreclosure: If set, fetches only foreclosure listings. :param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
:param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
:param limit: Limit the number of results returned. Maximum is 10,000.
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
validate_limit(limit)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
@@ -47,6 +50,7 @@ def scrape_property(
foreclosure=foreclosure, foreclosure=foreclosure,
extra_property_data=extra_property_data, extra_property_data=extra_property_data,
exclude_pending=exclude_pending, exclude_pending=exclude_pending,
limit=limit,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)

View File

@@ -22,6 +22,7 @@ class ScraperInput:
foreclosure: bool | None = False foreclosure: bool | None = False
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000
class Scraper: class Scraper:
@@ -64,6 +65,7 @@ class Scraper:
self.foreclosure = scraper_input.foreclosure self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit
def search(self) -> list[Property]: ... def search(self) -> list[Property]: ...

View File

@@ -113,6 +113,9 @@ class Property:
address: Address | None = None address: Address | None = None
list_price: int | None = None list_price: int | None = None
list_price_min: int | None = None
list_price_max: int | None = None
list_date: str | None = None list_date: str | None = None
pending_date: str | None = None pending_date: str | None = None
last_sold_date: str | None = None last_sold_date: str | None = None

View File

@@ -20,6 +20,7 @@ class RealtorScraper(Scraper):
PROPERTY_GQL = "https://graph.realtor.com/graphql" PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest" ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
NUM_PROPERTY_WORKERS = 20 NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
@@ -76,7 +77,6 @@ class RealtorScraper(Scraper):
baths_half baths_half
lot_sqft lot_sqft
sold_price sold_price
sold_price
type type
price price
status status
@@ -326,6 +326,8 @@ class RealtorScraper(Scraper):
last_sold_price last_sold_price
last_sold_date last_sold_date
list_price list_price
list_price_max
list_price_min
price_per_sqft price_per_sqft
flags { flags {
is_contingent is_contingent
@@ -551,6 +553,8 @@ class RealtorScraper(Scraper):
), ),
status="PENDING" if is_pending else result["status"].upper(), status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"], list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
prc_sqft=result.get("price_per_sqft"), prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"), last_sold_date=result.get("last_sold_date"),
@@ -571,9 +575,17 @@ class RealtorScraper(Scraper):
) )
return realty_property return realty_property
properties_list = response_json["data"][search_key]["results"]
total_properties = response_json["data"][search_key]["total"]
offset = variables.get("offset", 0)
#: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[:self.limit - offset]
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [ futures = [
executor.submit(process_property, result) for result in response_json["data"][search_key]["results"] executor.submit(process_property, result) for result in properties_list
] ]
for future in as_completed(futures): for future in as_completed(futures):
@@ -582,7 +594,7 @@ class RealtorScraper(Scraper):
properties.append(result) properties.append(result)
return { return {
"total": response_json["data"][search_key]["total"], "total": total_properties,
"properties": properties, "properties": properties,
} }
@@ -654,7 +666,7 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) )
for i in range(200, min(total, 10000), 200) for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
] ]
for future in as_completed(futures): for future in as_completed(futures):
@@ -790,7 +802,10 @@ class RealtorScraper(Scraper):
) )
@staticmethod @staticmethod
def _parse_description(result: dict) -> Description: def _parse_description(result: dict) -> Description | None:
if not result:
return None
description_data = result.get("description", {}) description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict): if description_data is None or not isinstance(description_data, dict):
@@ -801,10 +816,7 @@ class RealtorScraper(Scraper):
style = style.upper() style = style.upper()
primary_photo = "" primary_photo = ""
if result and "primary_photo" in result: if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")):
primary_photo_info = result["primary_photo"]
if primary_photo_info and "href" in primary_photo_info:
primary_photo_href = primary_photo_info["href"]
primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
return Description( return Description(

View File

@@ -24,6 +24,8 @@ ordered_properties = [
"year_built", "year_built",
"days_on_mls", "days_on_mls",
"list_price", "list_price",
"list_price_min",
"list_price_max",
"list_date", "list_date",
"sold_price", "sold_price",
"last_sold_date", "last_sold_date",
@@ -86,7 +88,8 @@ def process_result(result: Property) -> pd.DataFrame:
if description: if description:
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if isinstance(description.style, str) else description.style.value if description.style else None prop_data["style"] = description.style if isinstance(description.style,
str) else description.style.value if description.style else None
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half
@@ -110,7 +113,7 @@ def validate_input(listing_type: str) -> None:
def validate_dates(date_from: str | None, date_to: str | None) -> None: def validate_dates(date_from: str | None, date_to: str | None) -> None:
if (date_from is not None and date_to is None) or (date_from is None and date_to is not None): if isinstance(date_from, str) != isinstance(date_to, str):
raise InvalidDate("Both date_from and date_to must be provided.") raise InvalidDate("Both date_from and date_to must be provided.")
if date_from and date_to: if date_from and date_to:
@@ -122,3 +125,10 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
raise InvalidDate("date_to must be after date_from.") raise InvalidDate("date_to must be after date_from.")
except ValueError: except ValueError:
raise InvalidDate(f"Invalid date format or range") raise InvalidDate(f"Invalid date format or range")
def validate_limit(limit: int) -> None:
    """Raise ValueError unless *limit* is None or within the 1..10,000 range."""
    #: None is tolerated here (treated as "no limit supplied")
    if limit is None:
        return
    if not 1 <= limit <= 10000:
        raise ValueError("Property limit must be between 1 and 10,000.")

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.32" version = "0.3.34"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@@ -105,8 +105,8 @@ def test_realtor():
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",
listing_type="for_sale", listing_type="for_sale",
), ),
scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="85281"), scrape_property(location="85281"),
] ]
@@ -117,6 +117,7 @@ def test_realtor_city():
results = scrape_property( results = scrape_property(
location="Atlanta, GA", location="Atlanta, GA",
listing_type="for_sale", listing_type="for_sale",
limit=1000
) )
assert results is not None and len(results) > 0 assert results is not None and len(results) > 0
@@ -140,7 +141,7 @@ def test_realtor_foreclosed():
def test_realtor_agent(): def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale") scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000)
assert scraped["agent"].nunique() > 1 assert scraped["agent"].nunique() > 1
@@ -182,6 +183,58 @@ def test_style_value_error():
location="Alaska, AK", location="Alaska, AK",
listing_type="sold", listing_type="sold",
extra_property_data=False, extra_property_data=False,
limit=1000,
) )
assert results is not None and len(results) > 0 assert results is not None and len(results) > 0
def test_primary_image_error():
    """Regression test: listings without a usable primary photo must not crash the scrape."""
    rows = scrape_property(
        location="Spokane, PA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        past_days=360,
        radius=3,
        extra_property_data=False,
    )
    assert rows is not None
    assert len(rows) > 0
def test_limit():
    """The `limit` kwarg caps the result count; a limit of 1 yields exactly one row."""
    cap = 876
    capped = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        limit=cap,
    )
    assert capped is not None
    assert len(capped) <= cap

    floor = 1
    single = scrape_property(
        location="Waddell, AZ",
        listing_type="for_sale",
        limit=floor,
    )
    assert single is not None
    assert len(single) == floor
def test_apartment_list_price():
    """More than half of APARTMENT rentals should carry at least one list-price field."""
    rows = scrape_property(
        location="Spokane, WA",
        listing_type="for_rent",  # or (for_sale, for_rent, pending)
        extra_property_data=False,
    )
    assert rows is not None
    apartments = rows[rows["style"] == "APARTMENT"]
    #: fraction of apartments with at least one of list_price / list_price_min /
    #: list_price_max populated must exceed 50%
    price_cols = ["list_price", "list_price_min", "list_price_max"]
    priced = apartments[apartments[price_cols].notnull().any(axis=1)]
    assert len(priced) / len(apartments) > 0.5