From 3f44744d6193ab6a30a69e32c160690cc7bdaa94 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:19:57 -0700 Subject: [PATCH] - primary photo bug fix - limit parameter --- README.md | 4 +- homeharvest/__init__.py | 6 ++- homeharvest/core/scrapers/__init__.py | 2 + homeharvest/core/scrapers/realtor/__init__.py | 46 +++++++++---------- homeharvest/utils.py | 12 ++++- pyproject.toml | 2 +- 6 files changed, 44 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 565228c..26a8773 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,9 @@ Optional │ ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) │ -└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' +├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' +│ +└── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ``` ### Property Schema diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index a309591..b4950c2 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,7 +1,7 @@ import warnings import pandas as pd from .core.scrapers import ScraperInput -from .utils import process_result, ordered_properties, validate_input, validate_dates +from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit from .core.scrapers.realtor import RealtorScraper from .core.scrapers.models import ListingType @@ -18,6 +18,7 @@ def scrape_property( foreclosure: bool = None, extra_property_data: bool = True, exclude_pending: bool = False, + limit: int = 10000, ) -> pd.DataFrame: """ Scrape properties from Realtor.com based on a given location and listing type. @@ -31,9 +32,11 @@ def scrape_property( :param foreclosure: If set, fetches only foreclosure listings. :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending. + :param limit: Limit the number of results returned. Maximum is 10,000. """ validate_input(listing_type) validate_dates(date_from, date_to) + validate_limit(limit) scraper_input = ScraperInput( location=location, @@ -47,6 +50,7 @@ def scrape_property( foreclosure=foreclosure, extra_property_data=extra_property_data, exclude_pending=exclude_pending, + limit=limit, ) site = RealtorScraper(scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 6f1b9b6..5fd018d 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -22,6 +22,7 @@ class ScraperInput: foreclosure: bool | None = False extra_property_data: bool | None = True exclude_pending: bool | None = False + limit: int = 10000 class Scraper: @@ -64,6 +65,7 @@ class Scraper: self.foreclosure = scraper_input.foreclosure self.extra_property_data = scraper_input.extra_property_data self.exclude_pending = scraper_input.exclude_pending + self.limit = scraper_input.limit def search(self) -> list[Property]: ... diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 761970b..f9d202e 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -115,10 +115,10 @@ class RealtorScraper(Scraper): ) able_to_get_lat_long = ( - property_info - and property_info.get("address") - and property_info["address"].get("location") - and property_info["address"]["location"].get("coordinate") + property_info + and property_info.get("address") + and property_info["address"].get("location") + and property_info["address"]["location"].get("coordinate") ) list_date_str = ( property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None @@ -481,7 +481,7 @@ class RealtorScraper(Scraper): ) else: #: general search, came from an address query = ( - """query Property_search( + """query Property_search( $property_id: [ID]! $offset: Int!, ) { @@ -492,7 +492,7 @@ class RealtorScraper(Scraper): limit: 1 offset: $offset ) %s""" - % results_query + % results_query ) payload = { @@ -507,12 +507,12 @@ class RealtorScraper(Scraper): properties: list[Property] = [] if ( - response_json is None - or "data" not in response_json - or response_json["data"] is None - or search_key not in response_json["data"] - or response_json["data"][search_key] is None - or "results" not in response_json["data"][search_key] + response_json is None + or "data" not in response_json + or response_json["data"] is None + or search_key not in response_json["data"] + or response_json["data"][search_key] is None + or "results" not in response_json["data"][search_key] ): return {"total": 0, "properties": []} @@ -523,10 +523,10 @@ class RealtorScraper(Scraper): return able_to_get_lat_long = ( - result - and result.get("location") - and result["location"].get("address") - and result["location"]["address"].get("coordinate") + result + and result.get("location") + and result["location"].get("address") + and result["location"]["address"].get("coordinate") ) is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") @@ -654,7 +654,7 @@ class RealtorScraper(Scraper): variables=search_variables | {"offset": i}, search_type=search_type, ) - for i in range(200, min(total, 10000), 200) + for i in range(200, min(total, self.limit), 200) ] for future in as_completed(futures): @@ -790,7 +790,10 @@ class RealtorScraper(Scraper): ) @staticmethod - def _parse_description(result: dict) -> Description: + def _parse_description(result: dict) -> Description | None: + if not result: + return None + description_data = result.get("description", {}) if description_data is None or not isinstance(description_data, dict): @@ -801,11 +804,8 @@ class RealtorScraper(Scraper): style = style.upper() primary_photo = "" - if result and "primary_photo" in result: - primary_photo_info = result["primary_photo"] - if primary_photo_info and "href" in primary_photo_info: - primary_photo_href = primary_photo_info["href"] - primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") + if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")): + primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") return Description( primary_photo=primary_photo, diff --git a/homeharvest/utils.py b/homeharvest/utils.py index 46b75f4..dd21349 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -86,7 +86,8 @@ def process_result(result: Property) -> pd.DataFrame: if description: prop_data["primary_photo"] = description.primary_photo prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None - prop_data["style"] = description.style if isinstance(description.style, str) else description.style.value if description.style else None + prop_data["style"] = description.style if isinstance(description.style, + str) else description.style.value if description.style else None prop_data["beds"] = description.beds prop_data["full_baths"] = description.baths_full prop_data["half_baths"] = description.baths_half @@ -110,7 +111,7 @@ def validate_input(listing_type: str) -> None: def validate_dates(date_from: str | None, date_to: str | None) -> None: - if (date_from is not None and date_to is None) or (date_from is None and date_to is not None): + if isinstance(date_from, str) != isinstance(date_to, str): raise InvalidDate("Both date_from and date_to must be provided.") if date_from and date_to: @@ -122,3 +123,10 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None: raise InvalidDate("date_to must be after date_from.") except ValueError: raise InvalidDate(f"Invalid date format or range") + + +def validate_limit(limit: int) -> None: + #: 1 -> 10000 limit + + if limit is not None and (limit < 1 or limit > 10000): + raise ValueError("Property limit must be between 1 and 10,000.") diff --git a/pyproject.toml b/pyproject.toml index 76efd96..bc1a00d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.3.32" +version = "0.3.33" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest"