diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index c3ec0d3..27ea8cb 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -70,7 +70,10 @@ def process_result(result: Property) -> pd.DataFrame: prop_data["site_name"] = prop_data["site_name"].value prop_data["listing_type"] = prop_data["listing_type"].value.lower() - prop_data["property_type"] = prop_data["property_type"].value.lower() + if "property_type" in prop_data and prop_data["property_type"] is not None: + prop_data["property_type"] = prop_data["property_type"].value.lower() + else: + prop_data["property_type"] = None if "address" in prop_data: address_data = prop_data["address"] prop_data["street_address"] = address_data.street_address @@ -108,7 +111,7 @@ def scrape_property( scraper_input = ScraperInput( location=location, listing_type=ListingType[listing_type.upper()], - site_name=SiteName[site_name.upper()], + site_name=SiteName.get_by_value(site_name.lower()), ) site = _scrapers[site_name.lower()](scraper_input) diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index b08ac69..8385405 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -7,6 +7,13 @@ class SiteName(Enum): REDFIN = "redfin" REALTOR = "realtor.com" + @classmethod + def get_by_value(cls, value): + for item in cls: + if item.value == value: + return item + raise ValueError(f"{value} not found in {cls}") + class ListingType(Enum): FOR_SALE = "FOR_SALE" @@ -57,14 +64,13 @@ class Address: country: str | None = None - @dataclass class Property: property_url: str site_name: SiteName listing_type: ListingType - property_type: PropertyType address: Address + property_type: PropertyType | None = None # house for sale price: int | None = None @@ -78,7 +84,6 @@ class Property: stories: int | None = None year_built: int | None = None price_per_sqft: int | None = None - year_built: int | None = None mls_id: str | None = None agent_name: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index d3660f6..f6490f6 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -3,6 +3,7 @@ from ..models import Property, Address from .. import Scraper from typing import Any, Generator from ....exceptions import NoResultsFound +from ....utils import parse_address_two from concurrent.futures import ThreadPoolExecutor, as_completed @@ -29,7 +30,7 @@ class RealtorScraper(Scraper): params = { "input": self.location, - "client_id": self.listing_type.value.replace('_', '-'), + "client_id": self.listing_type.value.lower().replace("_", "-"), "limit": "1", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", } @@ -96,46 +97,57 @@ class RealtorScraper(Scraper): } }""" - variables = { - 'property_id': property_id - } + variables = {"property_id": property_id} payload = { - 'query': query, - 'variables': variables, + "query": query, + "variables": variables, } response = self.session.post(self.search_url, json=payload) response_json = response.json() - property_info = response_json['data']['property'] + property_info = response_json["data"]["property"] + street_address = property_info["address"]["line"] + unit = parse_address_two(street_address) - return [Property( - site_name=self.site_name, - address=Address( - address_one=property_info['address']['line'], - city=property_info['address']['city'], - state=property_info['address']['state_code'], - zip_code=property_info['address']['postal_code'], - ), - url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], - beds=property_info['basic']['beds'], - baths=property_info['basic']['baths'], - stories=property_info['details']['stories'], - year_built=property_info['details']['year_built'], - square_feet=property_info['basic']['sqft'], - price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] - if property_info['basic']['sqft'] is not None and - property_info['basic']['price'] is not None - else None, - price=property_info['basic']['price'], - mls_id=property_id, - listing_type=self.listing_type, - lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, - )] + return [ + Property( + site_name=self.site_name, + address=Address( + street_address=street_address, + city=property_info["address"]["city"], + state=property_info["address"]["state_code"], + zip_code=property_info["address"]["postal_code"], + unit=unit, + country="USA", + ), + property_url="https://www.realtor.com/realestateandhomes-detail/" + + property_info["details"]["permalink"], + beds=property_info["basic"]["beds"], + baths=property_info["basic"]["baths"], + stories=property_info["details"]["stories"], + year_built=property_info["details"]["year_built"], + square_feet=property_info["basic"]["sqft"], + price_per_sqft=property_info["basic"]["price"] + // property_info["basic"]["sqft"] + if property_info["basic"]["sqft"] is not None + and property_info["basic"]["price"] is not None + else None, + price=property_info["basic"]["price"], + mls_id=property_id, + listing_type=self.listing_type, + lot_area_value=property_info["public_record"]["lot_size"] + if property_info["public_record"] is not None + else None, + ) + ] - def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: - query = """query Home_search( + def handle_area( + self, variables: dict, return_total: bool = False + ) -> list[Property] | int: + query = ( + """query Home_search( $city: String, $county: [String], $state_code: String, @@ -193,42 +205,57 @@ class RealtorScraper(Scraper): } } } - }""" % self.listing_type.value + }""" + % self.listing_type.value.lower() + ) payload = { - 'query': query, - 'variables': variables, + "query": query, + "variables": variables, } response = self.session.post(self.search_url, json=payload) + response.raise_for_status() response_json = response.json() if return_total: - return response_json['data']['home_search']['total'] + return response_json["data"]["home_search"]["total"] properties: list[Property] = [] - for result in response_json['data']['home_search']['results']: + if ( + response_json is None + or "data" not in response_json + or response_json["data"] is None + or "home_search" not in response_json["data"] + or response_json["data"]["home_search"] is None + or "results" not in response_json["data"]["home_search"] + ): + return [] + + for result in response_json["data"]["home_search"]["results"]: realty_property = Property( address=Address( - address_one=result['location']['address']['line'], - city=result['location']['address']['city'], - state=result['location']['address']['state_code'], - zip_code=result['location']['address']['postal_code'], - address_two=result['location']['address']['unit'], + street_address=result["location"]["address"]["line"], + city=result["location"]["address"]["city"], + state=result["location"]["address"]["state_code"], + zip_code=result["location"]["address"]["postal_code"], + unit=result["location"]["address"]["unit"], + country="USA", ), site_name=self.site_name, - url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], - beds=result['description']['beds'], - baths=result['description']['baths'], - stories=result['description']['stories'], - year_built=result['description']['year_built'], - square_feet=result['description']['sqft'], - price_per_square_foot=result['price_per_sqft'], - price=result['list_price'], - mls_id=result['property_id'], + property_url="https://www.realtor.com/realestateandhomes-detail/" + + result["property_id"], + beds=result["description"]["beds"], + baths=result["description"]["baths"], + stories=result["description"]["stories"], + year_built=result["description"]["year_built"], + square_feet=result["description"]["sqft"], + price_per_sqft=result["price_per_sqft"], + price=result["list_price"], + mls_id=result["property_id"], listing_type=self.listing_type, - lot_size=result['description']['lot_sqft'], + lot_area_value=result["description"]["lot_sqft"], ) properties.append(realty_property) @@ -239,17 +266,17 @@ class RealtorScraper(Scraper): location_info = self.handle_location() location_type = location_info["area_type"] - if location_type == 'address': - property_id = location_info['mpr_id'] + if location_type == "address": + property_id = location_info["mpr_id"] return self.handle_address(property_id) offset = 0 search_variables = { - 'city': location_info.get('city'), - 'county': location_info.get('county'), - 'state_code': location_info.get('state_code'), - 'postal_code': location_info.get('postal_code'), - 'offset': offset, + "city": location_info.get("city"), + "county": location_info.get("county"), + "state_code": location_info.get("state_code"), + "postal_code": location_info.get("postal_code"), + "offset": offset, } total = self.handle_area(search_variables, return_total=True) @@ -258,8 +285,11 @@ class RealtorScraper(Scraper): with ThreadPoolExecutor(max_workers=10) as executor: futures = [ executor.submit( - self.handle_area, variables=search_variables | {'offset': i}, return_total=False - ) for i in range(0, total, 200) + self.handle_area, + variables=search_variables | {"offset": i}, + return_total=False, + ) + for i in range(0, total, 200) ] for future in as_completed(futures): diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index bec2cce..5350460 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -100,28 +100,27 @@ class RedfinScraper(Scraper): address=Address( street_address=" ".join( [ - building['address']['streetNumber'], - building['address']['directionalPrefix'], - building['address']['streetName'], - building['address']['streetType'], + building["address"]["streetNumber"], + building["address"]["directionalPrefix"], + building["address"]["streetName"], + building["address"]["streetType"], ] ), - city=building['address']['city'], - state=building['address']['stateOrProvinceCode'], - zip_code=building['address']['postalCode'], + city=building["address"]["city"], + state=building["address"]["stateOrProvinceCode"], + zip_code=building["address"]["postalCode"], unit=" ".join( [ - building['address']['unitType'], - building['address']['unitValue'], + building["address"]["unitType"], + building["address"]["unitValue"], ] - ) + ), ), property_url="https://www.redfin.com{}".format(building["url"]), listing_type=self.listing_type, bldg_unit_count=building["numUnitsForSale"], ) - def handle_address(self, home_id: str): """ EPs: @@ -160,7 +159,8 @@ class RedfinScraper(Scraper): homes = [ self._parse_home(home) for home in response_json["payload"]["homes"] ] + [ - self._parse_building(building) for building in response_json["payload"]["buildings"].values() + self._parse_building(building) + for building in response_json["payload"]["buildings"].values() ] return homes diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 4aa60a7..0b4d889 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -98,26 +98,24 @@ class ZillowScraper(Scraper): else filter_state_sold ) - payload = json.dumps( - { - "searchQueryState": { - "pagination": {}, - "isMapVisible": True, - "mapBounds": { - "west": coords[0], - "east": coords[1], - "south": coords[2], - "north": coords[3], - }, - "filterState": selected_filter, - "isListVisible": True, - "mapZoom": 11, + payload = { + "searchQueryState": { + "pagination": {}, + "isMapVisible": True, + "mapBounds": { + "west": coords[0], + "east": coords[1], + "south": coords[2], + "north": coords[3], }, - "wants": {"cat1": ["mapResults"]}, - "isDebugRequest": False, - } - ) - resp = self.session.put(url, headers=self._get_headers(), data=payload) + "filterState": selected_filter, + "isListVisible": True, + "mapZoom": 11, + }, + "wants": {"cat1": ["mapResults"]}, + "isDebugRequest": False, + } + resp = self.session.put(url, headers=self._get_headers(), json=payload) resp.raise_for_status() a = resp.json() return self._parse_properties(resp.json()) @@ -176,9 +174,7 @@ class ZillowScraper(Scraper): and result["variableData"]["type"] == "TIME_ON_INFO" else None, "img_src": result.get("imgSrc"), - "price_per_sqft": int( - home_info["price"] // home_info["livingArea"] - ) + "price_per_sqft": int(home_info["price"] // home_info["livingArea"]) if "livingArea" in home_info and "price" in home_info else None, } diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 291eb12..24d5281 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -3,9 +3,17 @@ from homeharvest import scrape_property def test_realtor(): results = [ - scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), - scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format - scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format + scrape_property( + location="2530 Al Lipscomb Way", + site_name="realtor.com", + listing_type="for_sale", + ), + scrape_property( + location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent" + ), #: does not support "city, state, USA" format + scrape_property( + location="Dallas, TX", site_name="realtor.com", listing_type="sold" + ), #: does not support "city, state, USA" format scrape_property(location="85281", site_name="realtor.com"), ] diff --git a/tests/test_redfin.py b/tests/test_redfin.py index 78fa541..575d1b4 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -3,9 +3,15 @@ from homeharvest import scrape_property def test_redfin(): results = [ - scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"), - scrape_property(location="Phoenix, AZ, USA", site_name="redfin"), - scrape_property(location="Dallas, TX, USA", site_name="redfin"), + scrape_property( + location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" + ), + scrape_property( + location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent" + ), + scrape_property( + location="Dallas, TX, USA", site_name="redfin", listing_type="sold" + ), scrape_property(location="85281", site_name="redfin"), ] diff --git a/tests/test_zillow.py b/tests/test_zillow.py index d9a56dc..38c3114 100644 --- a/tests/test_zillow.py +++ b/tests/test_zillow.py @@ -3,9 +3,15 @@ from homeharvest import scrape_property def test_zillow(): results = [ - scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"), - scrape_property(location="Phoenix, AZ, USA", site_name="zillow"), - scrape_property(location="Dallas, TX, USA", site_name="zillow"), + scrape_property( + location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" + ), + scrape_property( + location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent" + ), + scrape_property( + location="Dallas, TX, USA", site_name="zillow", listing_type="sold" + ), scrape_property(location="85281", site_name="zillow"), ]