- realtor radius

pull/31/head
Zachary Hampton 2023-10-02 13:58:47 -07:00
parent 1f1ca8068f
commit 40bbf76db1
4 changed files with 123 additions and 82 deletions

View File

@ -106,7 +106,7 @@ def _process_result(result: Property) -> pd.DataFrame:
return properties_df return properties_df
def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame: def _scrape_single_site(location: str, site_name: str, listing_type: str, radius: float, proxy: str = None) -> pd.DataFrame:
""" """
Helper function to scrape a single site. Helper function to scrape a single site.
""" """
@ -117,6 +117,7 @@ def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy:
listing_type=ListingType[listing_type.upper()], listing_type=ListingType[listing_type.upper()],
site_name=SiteName.get_by_value(site_name.lower()), site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy, proxy=proxy,
radius=radius,
) )
site = _scrapers[site_name.lower()](scraper_input) site = _scrapers[site_name.lower()](scraper_input)
@ -134,12 +135,14 @@ def scrape_property(
location: str, location: str,
site_name: Union[str, list[str]] = "realtor.com", site_name: Union[str, list[str]] = "realtor.com",
listing_type: str = "for_sale", listing_type: str = "for_sale",
radius: float = None,
proxy: str = None, proxy: str = None,
keep_duplicates: bool = False keep_duplicates: bool = False
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape property from various sites from a given location and listing type. Scrape property from various sites from a given location and listing type.
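:param radius: Radius in miles for finding comparable properties around a single address (realtor.com uses it only when the location resolves to an address; None disables the radius search)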
:param keep_duplicates: :param keep_duplicates:
:param proxy: :param proxy:
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way') :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
@ -157,12 +160,12 @@ def scrape_property(
results = [] results = []
if len(site_name) == 1: if len(site_name) == 1:
final_df = _scrape_single_site(location, site_name[0], listing_type, proxy) final_df = _scrape_single_site(location, site_name[0], listing_type, radius, proxy)
results.append(final_df) results.append(final_df)
else: else:
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = { futures = {
executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name executor.submit(_scrape_single_site, location, s_name, listing_type, radius, proxy): s_name
for s_name in site_name for s_name in site_name
} }

View File

@ -9,6 +9,7 @@ class ScraperInput:
location: str location: str
listing_type: ListingType listing_type: ListingType
site_name: SiteName site_name: SiteName
radius: float | None = None
proxy: str | None = None proxy: str | None = None
@ -29,6 +30,7 @@ class Scraper:
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name
self.radius = scraper_input.radius
def search(self) -> list[Property]: def search(self) -> list[Property]:
... ...

View File

@ -153,29 +153,12 @@ class RealtorScraper(Scraper):
) )
] ]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: def handle_area(self, variables: dict, is_for_comps: bool = False, return_total: bool = False) -> list[Property] | int:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
query = (
"""query Home_search( results_query = """{
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int,
) {
home_search(
query: {
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s
}
limit: 200
offset: $offset
) {
count count
total total
results { results {
@ -219,10 +202,41 @@ class RealtorScraper(Scraper):
id id
} }
} }
}}"""
if not is_for_comps:
query = (
"""query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int,
) {
home_search(
query: {
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s
} }
}""" limit: 200
% self.listing_type.value.lower() offset: $offset
) ) %s"""
% (self.listing_type.value.lower(), results_query))
else:
query = (
"""query Property_search(
$coordinates: [Float]!
$radius: String!
$offset: Int!,
) {
property_search(
query: { nearby: { coordinates: $coordinates, radius: $radius } }
limit: 200
offset: $offset
) %s""" % results_query)
payload = { payload = {
"query": query, "query": query,
@ -232,9 +246,10 @@ class RealtorScraper(Scraper):
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response.raise_for_status() response.raise_for_status()
response_json = response.json() response_json = response.json()
search_key = "home_search" if not is_for_comps else "property_search"
if return_total: if return_total:
return response_json["data"]["home_search"]["total"] return response_json["data"][search_key]["total"]
properties: list[Property] = [] properties: list[Property] = []
@ -242,13 +257,13 @@ class RealtorScraper(Scraper):
response_json is None response_json is None
or "data" not in response_json or "data" not in response_json
or response_json["data"] is None or response_json["data"] is None
or "home_search" not in response_json["data"] or search_key not in response_json["data"]
or response_json["data"]["home_search"] is None or response_json["data"][search_key] is None
or "results" not in response_json["data"]["home_search"] or "results" not in response_json["data"][search_key]
): ):
return [] return []
for result in response_json["data"]["home_search"]["results"]: for result in response_json["data"][search_key]["results"]:
self.counter += 1 self.counter += 1
address_one, _ = parse_address_one(result["location"]["address"]["line"]) address_one, _ = parse_address_one(result["location"]["address"]["line"])
realty_property = Property( realty_property = Property(
@ -297,12 +312,15 @@ class RealtorScraper(Scraper):
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
is_for_comps = self.radius is not None and location_type == "address"
if location_type == "address": if location_type == "address" and not is_for_comps:
property_id = location_info["mpr_id"] property_id = location_info["mpr_id"]
return self.handle_address(property_id) return self.handle_address(property_id)
offset = 0 offset = 0
if not is_for_comps:
search_variables = { search_variables = {
"city": location_info.get("city"), "city": location_info.get("city"),
"county": location_info.get("county"), "county": location_info.get("county"),
@ -310,8 +328,15 @@ class RealtorScraper(Scraper):
"postal_code": location_info.get("postal_code"), "postal_code": location_info.get("postal_code"),
"offset": offset, "offset": offset,
} }
else:
coordinates = list(location_info["centroid"].values())
search_variables = {
"coordinates": coordinates,
"radius": "{}mi".format(self.radius),
"offset": offset,
}
total = self.handle_area(search_variables, return_total=True) total = self.handle_area(search_variables, return_total=True, is_for_comps=is_for_comps)
homes = [] homes = []
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
@ -320,6 +345,7 @@ class RealtorScraper(Scraper):
self.handle_area, self.handle_area,
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
return_total=False, return_total=False,
is_for_comps=is_for_comps,
) )
for i in range(0, total, 200) for i in range(0, total, 200)
] ]

View File

@ -7,6 +7,16 @@ from homeharvest.exceptions import (
) )
def test_realtor_comps():
    """Radius ("comps") search around a single street address on realtor.com.

    Passing ``radius`` together with an address-style location requests
    comparable properties within that many miles of the address.
    """
    result = scrape_property(
        location="2530 Al Lipscomb Way",
        site_name="realtor.com",
        radius=0.5,
    )

    # Printing the frame can never fail the test; assert that the radius
    # search actually returned at least one listing.
    assert result is not None and len(result) > 0
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property( scrape_property(