fix: normalize unit num
parent
c7a4bfd5e4
commit
588689c230
|
@ -120,7 +120,7 @@ def _scrape_single_site(
|
||||||
|
|
||||||
def scrape_property(
|
def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
site_name: Union[str, list[str]],
|
site_name: Union[str, list[str]] = list(_scrapers.keys()),
|
||||||
listing_type: str = "for_sale",
|
listing_type: str = "for_sale",
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
@ -139,7 +139,9 @@ def scrape_property(
|
||||||
site_name = [site_name]
|
site_name = [site_name]
|
||||||
|
|
||||||
if len(site_name) == 1:
|
if len(site_name) == 1:
|
||||||
return _scrape_single_site(location, site_name[0], listing_type)
|
final_df = _scrape_single_site(location, site_name[0], listing_type)
|
||||||
|
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
|
||||||
|
return final_df
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
|
|
|
@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
|
||||||
city=result["location"]["address"]["city"],
|
city=result["location"]["address"]["city"],
|
||||||
state=result["location"]["address"]["state_code"],
|
state=result["location"]["address"]["state_code"],
|
||||||
zip_code=result["location"]["address"]["postal_code"],
|
zip_code=result["location"]["address"]["postal_code"],
|
||||||
unit=result["location"]["address"]["unit"],
|
unit=parse_address_two(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
country="USA",
|
||||||
),
|
),
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
|
|
|
@ -130,7 +130,7 @@ class ZillowScraper(Scraper):
|
||||||
home_info = result["hdpData"]["homeInfo"]
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
address_data = {
|
address_data = {
|
||||||
"street_address": home_info["streetAddress"],
|
"street_address": home_info["streetAddress"],
|
||||||
"unit": home_info.get("unit"),
|
"unit": parse_address_two(home_info['unit']) if 'unit' in home_info else None,
|
||||||
"city": home_info["city"],
|
"city": home_info["city"],
|
||||||
"state": home_info["state"],
|
"state": home_info["state"],
|
||||||
"zip_code": home_info["zipcode"],
|
"zip_code": home_info["zipcode"],
|
||||||
|
@ -213,22 +213,6 @@ class ZillowScraper(Scraper):
|
||||||
|
|
||||||
return properties_list
|
return properties_list
|
||||||
|
|
||||||
def _extract_units(self, result: dict):
|
|
||||||
units = {}
|
|
||||||
if "units" in result:
|
|
||||||
num_units = result.get("availabilityCount", len(result["units"]))
|
|
||||||
prices = [
|
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
|
||||||
for unit in result["units"]
|
|
||||||
]
|
|
||||||
units["apt_availability_count"] = num_units
|
|
||||||
units["apt_min_unit_price"] = min(prices)
|
|
||||||
units["apt_max_unit_price"] = max(prices)
|
|
||||||
units["apt_avg_unit_price"] = (
|
|
||||||
sum(prices) // num_units if num_units else None
|
|
||||||
)
|
|
||||||
return units
|
|
||||||
|
|
||||||
def _get_single_property_page(self, property_data: dict):
|
def _get_single_property_page(self, property_data: dict):
|
||||||
"""
|
"""
|
||||||
This method is used when a user enters the exact location & zillow returns just one property
|
This method is used when a user enters the exact location & zillow returns just one property
|
||||||
|
@ -239,10 +223,9 @@ class ZillowScraper(Scraper):
|
||||||
else property_data["hdpUrl"]
|
else property_data["hdpUrl"]
|
||||||
)
|
)
|
||||||
address_data = property_data["address"]
|
address_data = property_data["address"]
|
||||||
unit = parse_address_two(address_data["streetAddress"])
|
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=address_data["streetAddress"],
|
street_address=address_data["streetAddress"],
|
||||||
unit=unit,
|
unit=parse_address_two(address_data["streetAddress"]),
|
||||||
city=address_data["city"],
|
city=address_data["city"],
|
||||||
state=address_data["state"],
|
state=address_data["state"],
|
||||||
zip_code=address_data["zipcode"],
|
zip_code=address_data["zipcode"],
|
||||||
|
@ -301,11 +284,10 @@ class ZillowScraper(Scraper):
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||||
|
|
||||||
unit = parse_address_two(street_address)
|
|
||||||
return Address(
|
return Address(
|
||||||
street_address=street_address,
|
street_address=street_address,
|
||||||
city=city,
|
city=city,
|
||||||
unit=unit,
|
unit=parse_address_two(street_address),
|
||||||
state=state,
|
state=state,
|
||||||
zip_code=zip_code,
|
zip_code=zip_code,
|
||||||
country="USA",
|
country="USA",
|
||||||
|
|
|
@ -1,6 +1,20 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def parse_address_two(address_one: str):
|
def parse_address_two(street_address: str):
|
||||||
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
|
if not street_address:
|
||||||
return apt_match.group().strip() if apt_match else None
|
return None
|
||||||
|
apt_match = re.search(r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", street_address, re.I)
|
||||||
|
|
||||||
|
if apt_match:
|
||||||
|
apt_str = apt_match.group().strip()
|
||||||
|
apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
|
||||||
|
return apt_str
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print(parse_address_two("810 E Colter St APT 32"))
|
||||||
|
print(parse_address_two("1234 Elm Street apt 2B"))
|
||||||
|
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
||||||
|
print(parse_address_two("1234 Elm Street unit 3A"))
|
||||||
|
|
Loading…
Reference in New Issue