From ba9fe806a7240e4e7df09a0defd7935e7a4ff4e9 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:16:59 -0700 Subject: [PATCH 1/8] - finished realtor --- homeharvest/__init__.py | 8 +- homeharvest/core/scrapers/__init__.py | 4 +- homeharvest/core/scrapers/models.py | 3 +- homeharvest/core/scrapers/realtor/__init__.py | 233 +++++++++++++++++- tests/test_realtor.py | 3 + 5 files changed, 236 insertions(+), 15 deletions(-) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index f817806..4afbbc5 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -69,9 +69,9 @@ def process_result(result: Union[Building, Property]) -> pd.DataFrame: prop_data = result.__dict__ address_data = prop_data["address"] - prop_data["site_name"] = prop_data["site_name"].value + prop_data["site_name"] = prop_data["site_name"] prop_data["listing_type"] = prop_data["listing_type"].value - prop_data["property_type"] = prop_data["property_type"].value.lower() + prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data["property_type"] else None prop_data["address_one"] = address_data.address_one prop_data["city"] = address_data.city prop_data["state"] = address_data.state @@ -90,13 +90,13 @@ def scrape_property( location: str, site_name: str, listing_type: str = "for_sale", #: for_sale, for_rent, sold -) -> Union[list[Building], list[Property]]: +) -> pd.DataFrame: validate_input(site_name, listing_type) scraper_input = ScraperInput( location=location, listing_type=ListingType[listing_type.upper()], - site_name=SiteName[site_name.upper()], + site_name=site_name.lower(), ) site = _scrapers[site_name.lower()](scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 873bf76..e985eec 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -7,13 +7,15 @@ from .models import Property, ListingType, SiteName class ScraperInput: location: str listing_type: ListingType - site_name: SiteName + site_name: str proxy_url: str | None = None class Scraper: def __init__(self, scraper_input: ScraperInput): self.location = scraper_input.location + self.listing_type = scraper_input.listing_type + self.session = requests.Session() self.listing_type = scraper_input.listing_type self.site_name = scraper_input.site_name diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 1a3db97..b715fbd 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -53,7 +53,7 @@ class Address: @dataclass() class Realty: - site_name: SiteName + site_name: str address: Address url: str listing_type: ListingType | None = None @@ -68,7 +68,6 @@ class Property(Realty): year_built: int | None = None square_feet: int | None = None price_per_square_foot: int | None = None - year_built: int | None = None mls_id: str | None = None agent_name: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 8e4fbd8..d3660f6 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -1,12 +1,15 @@ import json from ..models import Property, Address from .. import Scraper -from typing import Any +from typing import Any, Generator +from ....exceptions import NoResultsFound +from concurrent.futures import ThreadPoolExecutor, as_completed class RealtorScraper(Scraper): def __init__(self, scraper_input): super().__init__(scraper_input) + self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" def handle_location(self): headers = { @@ -26,7 +29,7 @@ class RealtorScraper(Scraper): params = { "input": self.location, - "client_id": "for-sale", + "client_id": self.listing_type.value.replace('_', '-'), "limit": "1", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", } @@ -38,14 +41,228 @@ class RealtorScraper(Scraper): ) response_json = response.json() - return response_json["autocomplete"][0] + result = response_json["autocomplete"] + + if result is None: + raise NoResultsFound("No results found for location: " + self.location) + + return result[0] + + def handle_address(self, property_id: str) -> list[Property]: + query = """query Property($property_id: ID!) { + property(id: $property_id) { + property_id + details { + date_updated + garage + permalink + year_built + stories + } + address { + address_validation_code + city + country + county + line + postal_code + state_code + street_direction + street_name + street_number + street_suffix + street_post_direction + unit_value + unit + unit_descriptor + zip + } + basic { + baths + beds + price + sqft + lot_sqft + type + sold_price + } + public_record { + lot_size + sqft + stories + units + year_built + } + } + }""" + + variables = { + 'property_id': property_id + } + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + property_info = response_json['data']['property'] + + return [Property( + site_name=self.site_name, + address=Address( + address_one=property_info['address']['line'], + city=property_info['address']['city'], + state=property_info['address']['state_code'], + zip_code=property_info['address']['postal_code'], + ), + url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], + beds=property_info['basic']['beds'], + baths=property_info['basic']['baths'], + stories=property_info['details']['stories'], + year_built=property_info['details']['year_built'], + square_feet=property_info['basic']['sqft'], + price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] + if property_info['basic']['sqft'] is not None and + property_info['basic']['price'] is not None + else None, + price=property_info['basic']['price'], + mls_id=property_id, + listing_type=self.listing_type, + lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, + )] + + def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: + query = """query Home_search( + $city: String, + $county: [String], + $state_code: String, + $postal_code: String + $offset: Int, + ) { + home_search( + query: { + city: $city + county: $county + postal_code: $postal_code + state_code: $state_code + status: %s + } + limit: 200 + offset: $offset + ) { + count + total + results { + property_id + description { + baths + beds + lot_sqft + sqft + text + sold_price + stories + year_built + garage + unit_number + floor_number + } + location { + address { + city + country + line + postal_code + state_code + state + street_direction + street_name + street_number + street_post_direction + street_suffix + unit + } + } + list_price + price_per_sqft + source { + id + } + } + } + }""" % self.listing_type.value + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + if return_total: + return response_json['data']['home_search']['total'] + + properties: list[Property] = [] + + for result in response_json['data']['home_search']['results']: + realty_property = Property( + address=Address( + address_one=result['location']['address']['line'], + city=result['location']['address']['city'], + state=result['location']['address']['state_code'], + zip_code=result['location']['address']['postal_code'], + address_two=result['location']['address']['unit'], + ), + site_name=self.site_name, + url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], + beds=result['description']['beds'], + baths=result['description']['baths'], + stories=result['description']['stories'], + year_built=result['description']['year_built'], + square_feet=result['description']['sqft'], + price_per_square_foot=result['price_per_sqft'], + price=result['list_price'], + mls_id=result['property_id'], + listing_type=self.listing_type, + lot_size=result['description']['lot_sqft'], + ) + + properties.append(realty_property) + + return properties def search(self): location_info = self.handle_location() location_type = location_info["area_type"] - """ - property types: - apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes - """ - print("a") + if location_type == 'address': + property_id = location_info['mpr_id'] + return self.handle_address(property_id) + + offset = 0 + search_variables = { + 'city': location_info.get('city'), + 'county': location_info.get('county'), + 'state_code': location_info.get('state_code'), + 'postal_code': location_info.get('postal_code'), + 'offset': offset, + } + + total = self.handle_area(search_variables, return_total=True) + + homes = [] + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit( + self.handle_area, variables=search_variables | {'offset': i}, return_total=False + ) for i in range(0, total, 200) + ] + + for future in as_completed(futures): + homes.extend(future.result()) + + return homes diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 2649177..291eb12 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -3,6 +3,9 @@ from homeharvest import scrape_property def test_realtor(): results = [ + scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), + scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format + scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format scrape_property(location="85281", site_name="realtor.com"), ] From ba249ca20d4c7bc531c9e023a11672087eaa994f Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:26:35 -0700 Subject: [PATCH 2/8] - redfin buildings support --- homeharvest/__init__.py | 2 +- homeharvest/core/scrapers/redfin/__init__.py | 35 ++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 4afbbc5..2b53b13 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -71,7 +71,7 @@ def process_result(result: Union[Building, Property]) -> pd.DataFrame: address_data = prop_data["address"] prop_data["site_name"] = prop_data["site_name"] prop_data["listing_type"] = prop_data["listing_type"].value - prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data["property_type"] else None + prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data.get("property_type") else None prop_data["address_one"] = address_data.address_one prop_data["city"] = address_data.city prop_data["state"] = address_data.state diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index 29855a7..778a49f 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -1,5 +1,5 @@ import json -from ..models import Property, Address, PropertyType +from ..models import Property, Address, PropertyType, Building from .. import Scraper from typing import Any @@ -86,6 +86,34 @@ class RedfinScraper(Scraper): mls_id=get_value("mlsId"), ) + def _parse_building(self, building: dict) -> Building: + return Building( + address=Address( + address_one=" ".join( + [ + building['address']['streetNumber'], + building['address']['directionalPrefix'], + building['address']['streetName'], + building['address']['streetType'], + ] + ), + city=building['address']['city'], + state=building['address']['stateOrProvinceCode'], + zip_code=building['address']['postalCode'], + address_two=" ".join( + [ + building['address']['unitType'], + building['address']['unitValue'], + ] + ) + ), + site_name=self.site_name, + url="https://www.redfin.com{}".format(building["url"]), + listing_type=self.listing_type, + num_units=building["numUnitsForSale"], + ) + + def handle_address(self, home_id: str): """ EPs: @@ -123,5 +151,8 @@ class RedfinScraper(Scraper): homes = [ self._parse_home(home) for home in response_json["payload"]["homes"] - ] #: support buildings + ] + [ + self._parse_building(building) for building in response_json["payload"]["buildings"].values() + ] + return homes From 6b02394e954936cca60287e0aaf6dea87508313f Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:37:07 -0700 Subject: [PATCH 3/8] - scrape_property docstring --- homeharvest/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 2b53b13..8582148 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -91,6 +91,16 @@ def scrape_property( site_name: str, listing_type: str = "for_sale", #: for_sale, for_rent, sold ) -> pd.DataFrame: + """ + Scrape property from various sites from a given location and listing type. + + :returns: pd.DataFrame + :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way') + :param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin') + :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold') + :return: pd.DataFrame containing properties + """ + validate_input(site_name, listing_type) scraper_input = ScraperInput( From 54af03c86a62ebfee9581daaf6f46e23e249eaa8 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:37:37 -0700 Subject: [PATCH 4/8] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 233f38f..ad4a4a5 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,13 @@ - `zillow` - `redfin` +- `realtor.com` ## Listing Types - `for_rent` - `for_sale` +- `sold` ### Installation From 29897b8fbe88ec0cb197f6687b2b646748617e05 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:38:56 -0700 Subject: [PATCH 5/8] Update README.md --- README.md | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ad4a4a5..6bfa703 100644 --- a/README.md +++ b/README.md @@ -2,31 +2,15 @@ **HomeHarvest** aims to be the top Python real estate scraping library. -## RoadMap +_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._ -- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`. -- **Output**: Provides the option to return the scraped data as a Pandas dataframe. -- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience. - -## Site Name Options - -- `zillow` -- `redfin` -- `realtor.com` - -## Listing Types - -- `for_rent` -- `for_sale` -- `sold` - -### Installation +## Installation ```bash pip install --upgrade homeharvest ``` -### Example Usage +## Example Usage ``` from homeharvest import scrape_property @@ -35,3 +19,15 @@ properties = scrape_property( ) print(properties) ``` + +### Site Name Options + +- `zillow` +- `redfin` +- `realtor.com` + +### Listing Types + +- `for_rent` +- `for_sale` +- `sold` From c3c6bdd2c5b0c74fe5d6e58b86b5898a837671f3 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:39:34 -0700 Subject: [PATCH 6/8] - version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f1198a..3ecd478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.1.2" +version = "0.1.3" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/ZacharyHampton/HomeHarvest" From fd01bfb8b8e8817e2f641ea1bc124bae2e39ea4a Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:45:31 -0700 Subject: [PATCH 7/8] Update README.md --- README.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6bfa703..a362f5c 100644 --- a/README.md +++ b/README.md @@ -11,13 +11,20 @@ pip install --upgrade homeharvest ``` ## Example Usage -``` -from homeharvest import scrape_property +```py +>>> from homeharvest import scrape_property +... properties = scrape_property( +... location="85281", site_name="zillow", listing_type="for_rent" +... ) -properties = scrape_property( - location="85281", site_name="zillow", listing_type="for_rent" -) -print(properties) +>>> properties.head() + address_one city ... mls_id description +0 420 N Scottsdale Rd Tempe ... NaN NaN +1 1255 E University Dr Tempe ... NaN NaN +2 1979 E Rio Salado Pkwy Tempe ... NaN NaN +3 548 S Wilson St Tempe ... None None +4 945 E Playa Del Norte Dr Unit 4027 Tempe ... NaN NaN +[5 rows x 23 columns] ``` ### Site Name Options From 10c01f373ec716d286dede8c270c093d9f858187 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 10:01:52 -0700 Subject: [PATCH 8/8] Update README.md try with replit --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index a362f5c..17982cd 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ _**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._ +[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) + ## Installation ```bash