fix: normalize unit num

pull/2/head
Cullen Watson 2023-09-18 17:04:34 -05:00
parent c7a4bfd5e4
commit 588689c230
4 changed files with 25 additions and 27 deletions

View File

@ -120,7 +120,7 @@ def _scrape_single_site(
def scrape_property( def scrape_property(
location: str, location: str,
site_name: Union[str, list[str]], site_name: Union[str, list[str]] = list(_scrapers.keys()),
listing_type: str = "for_sale", listing_type: str = "for_sale",
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
@ -139,7 +139,9 @@ def scrape_property(
site_name = [site_name] site_name = [site_name]
if len(site_name) == 1: if len(site_name) == 1:
return _scrape_single_site(location, site_name[0], listing_type) final_df = _scrape_single_site(location, site_name[0], listing_type)
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
return final_df
results = [] results = []
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:

View File

@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip_code=result["location"]["address"]["postal_code"],
unit=result["location"]["address"]["unit"], unit=parse_address_two(result["location"]["address"]["unit"]),
country="USA", country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,

View File

@ -130,7 +130,7 @@ class ZillowScraper(Scraper):
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"street_address": home_info["streetAddress"], "street_address": home_info["streetAddress"],
"unit": home_info.get("unit"), "unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None,
"city": home_info["city"], "city": home_info["city"],
"state": home_info["state"], "state": home_info["state"],
"zip_code": home_info["zipcode"], "zip_code": home_info["zipcode"],
@ -213,22 +213,6 @@ class ZillowScraper(Scraper):
return properties_list return properties_list
def _extract_units(self, result: dict):
units = {}
if "units" in result:
num_units = result.get("availabilityCount", len(result["units"]))
prices = [
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in result["units"]
]
units["apt_availability_count"] = num_units
units["apt_min_unit_price"] = min(prices)
units["apt_max_unit_price"] = max(prices)
units["apt_avg_unit_price"] = (
sum(prices) // num_units if num_units else None
)
return units
def _get_single_property_page(self, property_data: dict): def _get_single_property_page(self, property_data: dict):
""" """
This method is used when a user enters the exact location & zillow returns just one property This method is used when a user enters the exact location & zillow returns just one property
@ -239,10 +223,9 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
unit = parse_address_two(address_data["streetAddress"])
address = Address( address = Address(
street_address=address_data["streetAddress"], street_address=address_data["streetAddress"],
unit=unit, unit=parse_address_two(address_data["streetAddress"]),
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
@ -301,11 +284,10 @@ class ZillowScraper(Scraper):
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
unit = parse_address_two(street_address)
return Address( return Address(
street_address=street_address, street_address=street_address,
city=city, city=city,
unit=unit, unit=parse_address_two(street_address),
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
country="USA", country="USA",

View File

@ -1,6 +1,20 @@
import re import re
# Trailing unit designator: "APT" / "UNIT" / "LOT" as a whole word (followed by
# whitespace or directly by a digit), or a "#", then an alphanumeric identifier
# that ends the address. The identifier is captured in group 1.
_UNIT_SUFFIX_RE = re.compile(
    r"(?:\b(?:APT|UNIT|LOT)(?:\s+|(?=\d))|#\s*)([\dA-Z]+)$",
    re.I,
)


def parse_address_two(street_address: str):
    """
    Extract the trailing unit designator of a street address, normalized to "#<id>".

    Recognizes "APT", "UNIT", "LOT" (case-insensitive) or "#" followed by an
    alphanumeric identifier at the end of the string, e.g.
    "810 E Colter St APT 32" -> "#32" and "1234 Elm Street unit 3A" -> "#3A".

    :param street_address: the address line to inspect; may be None or empty
    :return: the unit as "#<id>" (identifier case preserved), or None when the
        input is empty or carries no trailing unit designator
    """
    if not street_address:
        return None

    # Trailing whitespace would otherwise keep the "$"-anchored pattern from matching.
    unit_match = _UNIT_SUFFIX_RE.search(street_address.strip())
    if unit_match is None:
        return None

    # Build the result from the captured identifier instead of substituting the
    # keyword, so a keyword-like identifier can never produce a doubled "#".
    return f"#{unit_match.group(1)}"
if __name__ == "__main__":
    # Manual smoke check: print the parsed unit for a few sample addresses
    # covering upper- and lower-case "APT" / "UNIT" designators.
    sample_addresses = (
        "810 E Colter St APT 32",
        "1234 Elm Street apt 2B",
        "1234 Elm Street UNIT 3A",
        "1234 Elm Street unit 3A",
    )
    for sample in sample_addresses:
        print(parse_address_two(sample))