From 40bbf76db103bd6895f3140d306b65bb1f644cde Mon Sep 17 00:00:00 2001
From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com>
Date: Mon, 2 Oct 2023 13:58:47 -0700
Subject: [PATCH] - realtor radius

---
Review notes (text between the "---" line and the first "diff --git" line is
not part of the commit):
- Line breaks and leading indentation of the one-line-per-row hunks were
  restored from a whitespace-collapsed copy of this patch, so context lines
  may not match the tree byte for byte; run `git apply --check` first (add
  --ignore-whitespace if needed).
- Only the wording of the added ":param radius:" docstring line was changed;
  the post-image blob id on that file's "index" line predates the rewording.

 homeharvest/__init__.py                       |   9 +-
 homeharvest/core/scrapers/__init__.py         |   2 +
 homeharvest/core/scrapers/realtor/__init__.py | 184 ++++++++++--------
 tests/test_realtor.py                         |  10 +
 4 files changed, 123 insertions(+), 82 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index 8fe7d0d..f489674 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -106,7 +106,7 @@ def _process_result(result: Property) -> pd.DataFrame:
     return properties_df
 
 
-def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
+def _scrape_single_site(location: str, site_name: str, listing_type: str, radius: float, proxy: str = None) -> pd.DataFrame:
     """
     Helper function to scrape a single site.
     """
@@ -117,6 +117,7 @@ def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy:
         listing_type=ListingType[listing_type.upper()],
         site_name=SiteName.get_by_value(site_name.lower()),
         proxy=proxy,
+        radius=radius,
     )
 
     site = _scrapers[site_name.lower()](scraper_input)
@@ -134,12 +135,14 @@ def scrape_property(
     location: str,
     site_name: Union[str, list[str]] = "realtor.com",
     listing_type: str = "for_sale",
+    radius: float = None,
     proxy: str = None,
     keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
 
+    :param radius: Radius in miles; when location is a single address, realtor.com returns the properties within this radius (comps) instead of only that address
     :param keep_duplicates:
     :param proxy:
     :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
@@ -157,12 +160,12 @@ def scrape_property(
     results = []
 
     if len(site_name) == 1:
-        final_df = _scrape_single_site(location, site_name[0], listing_type, proxy)
+        final_df = _scrape_single_site(location, site_name[0], listing_type, radius, proxy)
         results.append(final_df)
     else:
         with ThreadPoolExecutor() as executor:
             futures = {
-                executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name
+                executor.submit(_scrape_single_site, location, s_name, listing_type, radius, proxy): s_name
                 for s_name in site_name
             }
 
diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py
index e900dbe..0ab548b 100644
--- a/homeharvest/core/scrapers/__init__.py
+++ b/homeharvest/core/scrapers/__init__.py
@@ -9,6 +9,7 @@ class ScraperInput:
     location: str
     listing_type: ListingType
     site_name: SiteName
+    radius: float | None = None
     proxy: str | None = None
 
 
@@ -29,6 +30,7 @@ class Scraper:
 
         self.listing_type = scraper_input.listing_type
         self.site_name = scraper_input.site_name
+        self.radius = scraper_input.radius
 
     def search(self) -> list[Property]:
         ...
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py
index 78ecc84..e1cb8e7 100644
--- a/homeharvest/core/scrapers/realtor/__init__.py
+++ b/homeharvest/core/scrapers/realtor/__init__.py
@@ -153,76 +153,90 @@ class RealtorScraper(Scraper):
             )
         ]
 
-    def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
+    def handle_area(self, variables: dict, is_for_comps: bool = False, return_total: bool = False) -> list[Property] | int:
         """
         Handles a location area & returns a list of properties
         """
-        query = (
-            """query Home_search(
-                            $city: String,
-                            $county: [String],
-                            $state_code: String,
-                            $postal_code: String
-                            $offset: Int,
-                        ) {
-                            home_search(
-                                query: {
-                                    city: $city
-                                    county: $county
-                                    postal_code: $postal_code
-                                    state_code: $state_code
-                                    status: %s
-                                }
-                                limit: 200
-                                offset: $offset
+
+        results_query = """{
+                            count
+                            total
+                            results {
+                                property_id
+                                description {
+                                    baths
+                                    beds
+                                    lot_sqft
+                                    sqft
+                                    text
+                                    sold_price
+                                    stories
+                                    year_built
+                                    garage
+                                    unit_number
+                                    floor_number
+                                }
+                                location {
+                                    address {
+                                        city
+                                        country
+                                        line
+                                        postal_code
+                                        state_code
+                                        state
+                                        street_direction
+                                        street_name
+                                        street_number
+                                        street_post_direction
+                                        street_suffix
+                                        unit
+                                        coordinate {
+                                            lon
+                                            lat
+                                        }
+                                    }
+                                }
+                                list_price
+                                price_per_sqft
+                                source {
+                                    id
+                                }
+                            }
+                        }}"""
+
+        if not is_for_comps:
+            query = (
+                """query Home_search(
+                                $city: String,
+                                $county: [String],
+                                $state_code: String,
+                                $postal_code: String
+                                $offset: Int,
                             ) {
-                                count
-                                total
-                                results {
-                                    property_id
-                                    description {
-                                        baths
-                                        beds
-                                        lot_sqft
-                                        sqft
-                                        text
-                                        sold_price
-                                        stories
-                                        year_built
-                                        garage
-                                        unit_number
-                                        floor_number
+                                home_search(
+                                    query: {
+                                        city: $city
+                                        county: $county
+                                        postal_code: $postal_code
+                                        state_code: $state_code
+                                        status: %s
                                     }
-                                    location {
-                                        address {
-                                            city
-                                            country
-                                            line
-                                            postal_code
-                                            state_code
-                                            state
-                                            street_direction
-                                            street_name
-                                            street_number
-                                            street_post_direction
-                                            street_suffix
-                                            unit
-                                            coordinate {
-                                                lon
-                                                lat
-                                            }
-                                        }
-                                    }
-                                    list_price
-                                    price_per_sqft
-                                    source {
-                                        id
-                                    }
-                                }
-                            }
-                        }"""
-            % self.listing_type.value.lower()
-        )
+                                    limit: 200
+                                    offset: $offset
+                                ) %s"""
+                % (self.listing_type.value.lower(), results_query))
+        else:
+            query = (
+                """query Property_search(
+                        $coordinates: [Float]!
+                        $radius: String!
+                        $offset: Int!,
+                    ) {
+                        property_search(
+                            query: { nearby: { coordinates: $coordinates, radius: $radius } }
+                            limit: 200
+                            offset: $offset
+                        ) %s""" % results_query)
 
         payload = {
             "query": query,
@@ -232,9 +246,10 @@ class RealtorScraper(Scraper):
         response = self.session.post(self.search_url, json=payload)
         response.raise_for_status()
         response_json = response.json()
+        search_key = "home_search" if not is_for_comps else "property_search"  # response key mirrors the query field sent above
 
         if return_total:
-            return response_json["data"]["home_search"]["total"]
+            return (((response_json or {}).get("data") or {}).get(search_key) or {}).get("total") or 0  # 0 when the response is malformed
 
         properties: list[Property] = []
 
@@ -242,13 +257,13 @@ class RealtorScraper(Scraper):
             response_json is None
             or "data" not in response_json
             or response_json["data"] is None
-            or "home_search" not in response_json["data"]
-            or response_json["data"]["home_search"] is None
-            or "results" not in response_json["data"]["home_search"]
+            or search_key not in response_json["data"]
+            or response_json["data"][search_key] is None
+            or "results" not in response_json["data"][search_key]
         ):
             return []
 
-        for result in response_json["data"]["home_search"]["results"]:
+        for result in response_json["data"][search_key]["results"]:
             self.counter += 1
             address_one, _ = parse_address_one(result["location"]["address"]["line"])
             realty_property = Property(
@@ -297,21 +312,31 @@ class RealtorScraper(Scraper):
     def search(self):
         location_info = self.handle_location()
         location_type = location_info["area_type"]
+        is_for_comps = self.radius is not None and location_type == "address"  # radius only applies to single-address lookups
 
-        if location_type == "address":
+        if location_type == "address" and not is_for_comps:
             property_id = location_info["mpr_id"]
             return self.handle_address(property_id)
 
         offset = 0
 
-        search_variables = {
-            "city": location_info.get("city"),
-            "county": location_info.get("county"),
-            "state_code": location_info.get("state_code"),
-            "postal_code": location_info.get("postal_code"),
-            "offset": offset,
-        }
-        total = self.handle_area(search_variables, return_total=True)
+        if not is_for_comps:
+            search_variables = {
+                "city": location_info.get("city"),
+                "county": location_info.get("county"),
+                "state_code": location_info.get("state_code"),
+                "postal_code": location_info.get("postal_code"),
+                "offset": offset,
+            }
+        else:
+            coordinates = list(location_info["centroid"].values())  # NOTE(review): relies on centroid key order matching what `nearby` expects - confirm
+            search_variables = {
+                "coordinates": coordinates,
+                "radius": f"{self.radius}mi",
+                "offset": offset,
+            }
+
+        total = self.handle_area(search_variables, return_total=True, is_for_comps=is_for_comps)
 
         homes = []
         with ThreadPoolExecutor(max_workers=10) as executor:
@@ -320,6 +345,7 @@ class RealtorScraper(Scraper):
                     self.handle_area,
                     variables=search_variables | {"offset": i},
                     return_total=False,
+                    is_for_comps=is_for_comps,
                 )
                 for i in range(0, total, 200)
             ]
diff --git a/tests/test_realtor.py b/tests/test_realtor.py
index 3b23529..db8cb51 100644
--- a/tests/test_realtor.py
+++ b/tests/test_realtor.py
@@ -7,6 +7,16 @@ from homeharvest.exceptions import (
 )
 
 
+def test_realtor_comps():
+    result = scrape_property(
+        location="2530 Al Lipscomb Way",
+        site_name="realtor.com",
+        radius=0.5,
+    )
+
+    assert result is not None and len(result) > 0
+
+
 def test_realtor():
     results = [
         scrape_property(