diff --git a/README.md b/README.md index 233f38f..17982cd 100644 --- a/README.md +++ b/README.md @@ -2,34 +2,41 @@ **HomeHarvest** aims to be the top Python real estate scraping library. -## RoadMap +_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._ -- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`. -- **Output**: Provides the option to return the scraped data as a Pandas dataframe. -- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience. +[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) -## Site Name Options - -- `zillow` -- `redfin` - -## Listing Types - -- `for_rent` -- `for_sale` - -### Installation +## Installation ```bash pip install --upgrade homeharvest ``` -### Example Usage -``` -from homeharvest import scrape_property +## Example Usage +```py +>>> from homeharvest import scrape_property +... properties = scrape_property( +... location="85281", site_name="zillow", listing_type="for_rent" +... ) -properties = scrape_property( - location="85281", site_name="zillow", listing_type="for_rent" -) -print(properties) +>>> properties.head() + address_one city ... mls_id description +0 420 N Scottsdale Rd Tempe ... NaN NaN +1 1255 E University Dr Tempe ... NaN NaN +2 1979 E Rio Salado Pkwy Tempe ... NaN NaN +3 548 S Wilson St Tempe ... None None +4 945 E Playa Del Norte Dr Unit 4027 Tempe ... NaN NaN +[5 rows x 23 columns] ``` + +### Site Name Options + +- `zillow` +- `redfin` +- `realtor.com` + +### Listing Types + +- `for_rent` +- `for_sale` +- `sold` diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 009aee6..c3ec0d3 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -92,7 +92,17 @@ def scrape_property( location: str, site_name: str, listing_type: str = "for_sale", #: for_sale, for_rent, sold -) -> list[Property]: +) -> pd.DataFrame: + """ + Scrape property from various sites from a given location and listing type. + + :returns: pd.DataFrame + :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way') + :param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin') + :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold') + :return: pd.DataFrame containing properties + """ + validate_input(site_name, listing_type) scraper_input = ScraperInput( diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 873bf76..3e2c25a 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -14,6 +14,8 @@ class ScraperInput: class Scraper: def __init__(self, scraper_input: ScraperInput): self.location = scraper_input.location + self.listing_type = scraper_input.listing_type + self.session = requests.Session() self.listing_type = scraper_input.listing_type self.site_name = scraper_input.site_name diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 6ae6955..b08ac69 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -57,6 +57,7 @@ class Address: country: str | None = None + @dataclass class Property: property_url: str diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 8e4fbd8..d3660f6 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -1,12 +1,15 @@ import json from ..models import Property, Address from .. import Scraper -from typing import Any +from typing import Any, Generator +from ....exceptions import NoResultsFound +from concurrent.futures import ThreadPoolExecutor, as_completed class RealtorScraper(Scraper): def __init__(self, scraper_input): super().__init__(scraper_input) + self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" def handle_location(self): headers = { @@ -26,7 +29,7 @@ class RealtorScraper(Scraper): params = { "input": self.location, - "client_id": "for-sale", + "client_id": self.listing_type.value.replace('_', '-'), "limit": "1", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", } @@ -38,14 +41,228 @@ class RealtorScraper(Scraper): ) response_json = response.json() - return response_json["autocomplete"][0] + result = response_json["autocomplete"] + + if result is None: + raise NoResultsFound("No results found for location: " + self.location) + + return result[0] + + def handle_address(self, property_id: str) -> list[Property]: + query = """query Property($property_id: ID!) { + property(id: $property_id) { + property_id + details { + date_updated + garage + permalink + year_built + stories + } + address { + address_validation_code + city + country + county + line + postal_code + state_code + street_direction + street_name + street_number + street_suffix + street_post_direction + unit_value + unit + unit_descriptor + zip + } + basic { + baths + beds + price + sqft + lot_sqft + type + sold_price + } + public_record { + lot_size + sqft + stories + units + year_built + } + } + }""" + + variables = { + 'property_id': property_id + } + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + property_info = response_json['data']['property'] + + return [Property( + site_name=self.site_name, + address=Address( + address_one=property_info['address']['line'], + city=property_info['address']['city'], + state=property_info['address']['state_code'], + zip_code=property_info['address']['postal_code'], + ), + url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], + beds=property_info['basic']['beds'], + baths=property_info['basic']['baths'], + stories=property_info['details']['stories'], + year_built=property_info['details']['year_built'], + square_feet=property_info['basic']['sqft'], + price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] + if property_info['basic']['sqft'] is not None and + property_info['basic']['price'] is not None + else None, + price=property_info['basic']['price'], + mls_id=property_id, + listing_type=self.listing_type, + lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, + )] + + def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: + query = """query Home_search( + $city: String, + $county: [String], + $state_code: String, + $postal_code: String + $offset: Int, + ) { + home_search( + query: { + city: $city + county: $county + postal_code: $postal_code + state_code: $state_code + status: %s + } + limit: 200 + offset: $offset + ) { + count + total + results { + property_id + description { + baths + beds + lot_sqft + sqft + text + sold_price + stories + year_built + garage + unit_number + floor_number + } + location { + address { + city + country + line + postal_code + state_code + state + street_direction + street_name + street_number + street_post_direction + street_suffix + unit + } + } + list_price + price_per_sqft + source { + id + } + } + } + }""" % self.listing_type.value + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + if return_total: + return response_json['data']['home_search']['total'] + + properties: list[Property] = [] + + for result in response_json['data']['home_search']['results']: + realty_property = Property( + address=Address( + address_one=result['location']['address']['line'], + city=result['location']['address']['city'], + state=result['location']['address']['state_code'], + zip_code=result['location']['address']['postal_code'], + address_two=result['location']['address']['unit'], + ), + site_name=self.site_name, + url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], + beds=result['description']['beds'], + baths=result['description']['baths'], + stories=result['description']['stories'], + year_built=result['description']['year_built'], + square_feet=result['description']['sqft'], + price_per_square_foot=result['price_per_sqft'], + price=result['list_price'], + mls_id=result['property_id'], + listing_type=self.listing_type, + lot_size=result['description']['lot_sqft'], + ) + + properties.append(realty_property) + + return properties def search(self): location_info = self.handle_location() location_type = location_info["area_type"] - """ - property types: - apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes - """ - print("a") + if location_type == 'address': + property_id = location_info['mpr_id'] + return self.handle_address(property_id) + + offset = 0 + search_variables = { + 'city': location_info.get('city'), + 'county': location_info.get('county'), + 'state_code': location_info.get('state_code'), + 'postal_code': location_info.get('postal_code'), + 'offset': offset, + } + + total = self.handle_area(search_variables, return_total=True) + + homes = [] + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit( + self.handle_area, variables=search_variables | {'offset': i}, return_total=False + ) for i in range(0, total, 200) + ] + + for future in as_completed(futures): + homes.extend(future.result()) + + return homes diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index f1d9c29..bec2cce 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -93,6 +93,35 @@ class RedfinScraper(Scraper): mls_id=get_value("mlsId"), ) + def _parse_building(self, building: dict) -> Property: + return Property( + site_name=self.site_name, + property_type=PropertyType("BUILDING"), + address=Address( + street_address=" ".join( + [ + building['address']['streetNumber'], + building['address']['directionalPrefix'], + building['address']['streetName'], + building['address']['streetType'], + ] + ), + city=building['address']['city'], + state=building['address']['stateOrProvinceCode'], + zip_code=building['address']['postalCode'], + unit=" ".join( + [ + building['address']['unitType'], + building['address']['unitValue'], + ] + ) + ), + property_url="https://www.redfin.com{}".format(building["url"]), + listing_type=self.listing_type, + bldg_unit_count=building["numUnitsForSale"], + ) + + def handle_address(self, home_id: str): """ EPs: @@ -130,5 +159,8 @@ class RedfinScraper(Scraper): homes = [ self._parse_home(home) for home in response_json["payload"]["homes"] - ] #: support buildings + ] + [ + self._parse_building(building) for building in response_json["payload"]["buildings"].values() + ] + return homes diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 6c36196..4aa60a7 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -117,11 +117,10 @@ class ZillowScraper(Scraper): "isDebugRequest": False, } ) - print(payload) resp = self.session.put(url, headers=self._get_headers(), data=payload) resp.raise_for_status() a = resp.json() - return parse_properties(resp.json()) + return self._parse_properties(resp.json()) def _parse_properties(self, property_data: dict): mapresults = property_data["cat1"]["searchResults"]["mapResults"] @@ -129,98 +128,92 @@ class ZillowScraper(Scraper): properties_list = [] for result in mapresults: - try: - if "hdpData" in result: - home_info = result["hdpData"]["homeInfo"] - address_data = { - "street_address": home_info["streetAddress"], - "unit": home_info.get("unit"), - "city": home_info["city"], - "state": home_info["state"], - "zip_code": home_info["zipcode"], - "country": home_info["country"], - } - property_data = { - "site_name": self.site_name, - "address": Address(**address_data), - "property_url": f"https://www.zillow.com{result['detailUrl']}", - "beds": int(home_info["bedrooms"]) - if "bedrooms" in home_info - else None, - "baths": home_info.get("bathrooms"), - "square_feet": int(home_info["livingArea"]) - if "livingArea" in home_info - else None, - "currency": home_info["currency"], - "price": home_info.get("price"), - "square_feet": int(home_info["livingArea"]) - if "livingArea" in home_info - else None, - "tax_assessed_value": int(home_info["taxAssessedValue"]) - if "taxAssessedValue" in home_info - else None, - "property_type": PropertyType(home_info["homeType"]), - "listing_type": ListingType( - home_info["statusType"] - if "statusType" in home_info - else self.listing_type - ), - "lot_area_value": round(home_info["lotAreaValue"], 2) - if "lotAreaValue" in home_info - else None, - "lot_area_unit": home_info.get("lotAreaUnit"), - "latitude": result["latLong"]["latitude"], - "longitude": result["latLong"]["longitude"], - "status_text": result.get("statusText"), - "posted_time": result["variableData"]["text"] - if "variableData" in result - and "text" in result["variableData"] - and result["variableData"]["type"] == "TIME_ON_INFO" - else None, - "img_src": result.get("imgSrc"), - "price_per_sqft": int( - home_info["price"] // home_info["livingArea"] - ) - if "livingArea" in home_info and "price" in home_info - else None, - } - property_obj = Property(**property_data) - properties_list.append(property_obj) + if "hdpData" in result: + home_info = result["hdpData"]["homeInfo"] + address_data = { + "street_address": home_info["streetAddress"], + "unit": home_info.get("unit"), + "city": home_info["city"], + "state": home_info["state"], + "zip_code": home_info["zipcode"], + "country": home_info["country"], + } + property_data = { + "site_name": self.site_name, + "address": Address(**address_data), + "property_url": f"https://www.zillow.com{result['detailUrl']}", + "beds": int(home_info["bedrooms"]) + if "bedrooms" in home_info + else None, + "baths": home_info.get("bathrooms"), + "square_feet": int(home_info["livingArea"]) + if "livingArea" in home_info + else None, + "currency": home_info["currency"], + "price": home_info.get("price"), + "square_feet": int(home_info["livingArea"]) + if "livingArea" in home_info + else None, + "tax_assessed_value": int(home_info["taxAssessedValue"]) + if "taxAssessedValue" in home_info + else None, + "property_type": PropertyType(home_info["homeType"]), + "listing_type": ListingType( + home_info["statusType"] + if "statusType" in home_info + else self.listing_type + ), + "lot_area_value": round(home_info["lotAreaValue"], 2) + if "lotAreaValue" in home_info + else None, + "lot_area_unit": home_info.get("lotAreaUnit"), + "latitude": result["latLong"]["latitude"], + "longitude": result["latLong"]["longitude"], + "status_text": result.get("statusText"), + "posted_time": result["variableData"]["text"] + if "variableData" in result + and "text" in result["variableData"] + and result["variableData"]["type"] == "TIME_ON_INFO" + else None, + "img_src": result.get("imgSrc"), + "price_per_sqft": int( + home_info["price"] // home_info["livingArea"] + ) + if "livingArea" in home_info and "price" in home_info + else None, + } + property_obj = Property(**property_data) + properties_list.append(property_obj) - elif "isBuilding" in result: - price = result["price"] - building_data = { - "property_url": f"https://www.zillow.com{result['detailUrl']}", - "site_name": self.site_name, - "property_type": PropertyType("BUILDING"), - "listing_type": ListingType(result["statusType"]), - "img_src": result["imgSrc"], - "price": int(price.replace("From $", "").replace(",", "")) - if "From $" in price - else None, - "apt_min_price": int( - price.replace("$", "").replace(",", "").replace("+/mo", "") - ) - if "+/mo" in price - else None, - "address": self._extract_address(result["address"]), - "bldg_min_beds": result["minBeds"], - "currency": "USD", - "bldg_min_baths": result["minBaths"], - "bldg_min_area": result.get("minArea"), - "bldg_unit_count": result["unitCount"], - "bldg_name": result.get("communityName"), - "status_text": result["statusText"], - "latitude": result["latLong"]["latitude"], - "longitude": result["latLong"]["longitude"], - } - building_obj = Property(**building_data) - properties_list.append(building_obj) - - except Exception as e: - print(home_info) - traceback.print_exc() - sys.exit() + elif "isBuilding" in result: + price = result["price"] + building_data = { + "property_url": f"https://www.zillow.com{result['detailUrl']}", + "site_name": self.site_name, + "property_type": PropertyType("BUILDING"), + "listing_type": ListingType(result["statusType"]), + "img_src": result["imgSrc"], + "price": int(price.replace("From $", "").replace(",", "")) + if "From $" in price + else None, + "apt_min_price": int( + price.replace("$", "").replace(",", "").replace("+/mo", "") + ) + if "+/mo" in price + else None, + "address": self._extract_address(result["address"]), + "bldg_min_beds": result["minBeds"], + "currency": "USD", + "bldg_min_baths": result["minBaths"], + "bldg_min_area": result.get("minArea"), + "bldg_unit_count": result["unitCount"], + "bldg_name": result.get("communityName"), + "status_text": result["statusText"], + "latitude": result["latLong"]["latitude"], + "longitude": result["latLong"]["longitude"], + } + building_obj = Property(**building_data) + properties_list.append(building_obj) return properties_list diff --git a/pyproject.toml b/pyproject.toml index 0f1198a..3ecd478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.1.2" +version = "0.1.3" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 2649177..291eb12 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -3,6 +3,9 @@ from homeharvest import scrape_property def test_realtor(): results = [ + scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), + scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format + scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format scrape_property(location="85281", site_name="realtor.com"), ]