From 3ec47c5b6a243736b98736442467bcafb29d0b41 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:28:03 -0700 Subject: [PATCH] - invalid test cases - redfin and realtor bug fixes - dupe check bug fix --- homeharvest/__init__.py | 14 +++++++++++++- homeharvest/core/scrapers/realtor/__init__.py | 2 +- homeharvest/core/scrapers/redfin/__init__.py | 4 ++++ homeharvest/core/scrapers/zillow/__init__.py | 5 +---- tests/test_realtor.py | 15 +++++++++++++++ tests/test_redfin.py | 15 +++++++++++++++ tests/test_zillow.py | 15 +++++++++++++++ 7 files changed, 64 insertions(+), 6 deletions(-) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index e2f7f2a..a774fc3 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -17,6 +17,7 @@ _scrapers = { "zillow": ZillowScraper, } + def validate_input(site_name: str, listing_type: str) -> None: if site_name.lower() not in _scrapers: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") @@ -26,6 +27,7 @@ def validate_input(site_name: str, listing_type: str) -> None: f"Provided listing type, '{listing_type}', does not exist." ) + def get_ordered_properties(result: Property) -> list[str]: return [ "property_url", @@ -65,6 +67,7 @@ def get_ordered_properties(result: Property) -> list[str]: "longitude", ] + def process_result(result: Property) -> pd.DataFrame: prop_data = result.__dict__ @@ -90,6 +93,7 @@ def process_result(result: Property) -> pd.DataFrame: return properties_df + def _scrape_single_site( location: str, site_name: str, listing_type: str ) -> pd.DataFrame: @@ -157,5 +161,13 @@ def scrape_property( return pd.DataFrame() final_df = pd.concat(results, ignore_index=True) + + columns_to_track = ["street_address", "city", "unit"] + + #: validate they exist, otherwise create them + for col in columns_to_track: + if col not in final_df.columns: + final_df[col] = None + final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first") - return final_df \ No newline at end of file + return final_df diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index f6cd68d..b574f5d 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -44,7 +44,7 @@ class RealtorScraper(Scraper): result = response_json["autocomplete"] - if result is None: + if not result: raise NoResultsFound("No results found for location: " + self.location) return result[0] diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index d701ff2..b929ed6 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -3,6 +3,7 @@ from typing import Any from .. import Scraper from ....utils import parse_address_two, parse_unit from ..models import Property, Address, PropertyType +from ....exceptions import NoResultsFound class RedfinScraper(Scraper): @@ -26,6 +27,9 @@ class RedfinScraper(Scraper): elif match_type == "1": return "address" #: address, needs to be handled differently + if "exactMatch" not in response_json['payload']: + raise NoResultsFound("No results found for location: {}".format(self.location)) + if response_json["payload"]["exactMatch"] is not None: target = response_json["payload"]["exactMatch"] else: diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 3cfb5cb..d6c92d4 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -2,7 +2,7 @@ import re import json from .. import Scraper from ....utils import parse_address_two, parse_unit -from ....exceptions import GeoCoordsNotFound +from ....exceptions import GeoCoordsNotFound, NoResultsFound from ..models import Property, Address, ListingType, PropertyType, SiteName @@ -151,9 +151,6 @@ class ZillowScraper(Scraper): else None, "currency": home_info["currency"], "price": home_info.get("price"), - "square_feet": int(home_info["livingArea"]) - if "livingArea" in home_info - else None, "tax_assessed_value": int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None, diff --git a/tests/test_realtor.py b/tests/test_realtor.py index b9525c9..16926f8 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -1,4 +1,5 @@ from homeharvest import scrape_property +from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound def test_realtor(): @@ -18,3 +19,17 @@ def test_realtor(): ] assert all([result is not None for result in results]) + + bad_results = [] + try: + bad_results += [ + scrape_property( + location="abceefg ju098ot498hh9", + site_name="realtor.com", + listing_type="for_sale", + ) + ] + except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound): + assert True + + assert all([result is None for result in bad_results]) diff --git a/tests/test_redfin.py b/tests/test_redfin.py index bc7912a..84995ff 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -1,4 +1,5 @@ from homeharvest import scrape_property +from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound def test_redfin(): @@ -16,3 +17,17 @@ def test_redfin(): ] assert all([result is not None for result in results]) + + bad_results = [] + try: + bad_results += [ + scrape_property( + location="abceefg ju098ot498hh9", + site_name="redfin", + listing_type="for_sale", + ) + ] + except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound): + assert True + + assert all([result is None for result in bad_results]) diff --git a/tests/test_zillow.py b/tests/test_zillow.py index b99b6d6..ddb7d8e 100644 --- a/tests/test_zillow.py +++ b/tests/test_zillow.py @@ -1,4 +1,5 @@ from homeharvest import scrape_property +from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound def test_zillow(): @@ -16,3 +17,17 @@ def test_zillow(): ] assert all([result is not None for result in results]) + + bad_results = [] + try: + bad_results += [ + scrape_property( + location="abceefg ju098ot498hh9", + site_name="zillow", + listing_type="for_sale", + ) + ] + except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound): + assert True + + assert all([result is None for result in bad_results])