fix: filter dup on street, unit, city

pull/3/head
Cullen Watson 2023-09-18 17:42:16 -05:00
parent 8e140a0e45
commit ca260fd2b4
8 changed files with 71 additions and 35 deletions

View File

@ -140,7 +140,9 @@ def scrape_property(
if len(site_name) == 1:
final_df = _scrape_single_site(location, site_name[0], listing_type)
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
)
return final_df
results = []
@ -157,5 +159,7 @@ def scrape_property(
if not results:
return pd.DataFrame()
final_df = pd.concat(results, ignore_index=True)
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
)
return final_df

View File

@ -3,7 +3,7 @@ from ..models import Property, Address
from .. import Scraper
from typing import Any, Generator
from ....exceptions import NoResultsFound
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from concurrent.futures import ThreadPoolExecutor, as_completed
@ -108,8 +108,7 @@ class RealtorScraper(Scraper):
response_json = response.json()
property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"]
unit = parse_address_two(street_address)
street_address, unit = parse_address_two(property_info["address"]["line"])
return [
Property(
@ -234,13 +233,16 @@ class RealtorScraper(Scraper):
return []
for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
realty_property = Property(
address=Address(
street_address=result["location"]["address"]["line"],
street_address=street_address,
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"],
unit=parse_address_two(result["location"]["address"]["unit"]),
unit=parse_unit(result["location"]["address"]["unit"]),
country="USA",
),
site_name=self.site_name,

View File

@ -1,7 +1,7 @@
import json
from typing import Any
from .. import Scraper
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from ..models import Property, Address, PropertyType
@ -39,9 +39,10 @@ class RedfinScraper(Scraper):
return home[key]["value"]
if not single_search:
unit = parse_address_two(get_value("streetLine"))
street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address(
street_address=get_value("streetLine"),
street_address=street_address,
city=home["city"],
state=home["state"],
zip_code=home["zip"],
@ -50,10 +51,11 @@ class RedfinScraper(Scraper):
)
else:
address_info = home["streetAddress"]
street_address, unit = parse_address_two(address_info["assembledAddress"])
unit = parse_address_two(address_info["assembledAddress"])
address = Address(
street_address=address_info["assembledAddress"],
street_address=street_address,
city=home["city"],
state=home["state"],
zip_code=home["zip"],
@ -94,26 +96,30 @@ class RedfinScraper(Scraper):
)
def _parse_building(self, building: dict) -> Property:
street_address = " ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
)
street_address, unit = parse_address_two(street_address)
return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
street_address=" ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
),
street_address=street_address,
city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"],
unit=" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
unit=parse_unit(
" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
)
),
),
property_url="https://www.redfin.com{}".format(building["url"]),

View File

@ -1,7 +1,7 @@
import re
import json
from .. import Scraper
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName
@ -129,8 +129,8 @@ class ZillowScraper(Scraper):
if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"]
address_data = {
"street_address": home_info["streetAddress"],
"unit": parse_address_two(home_info["unit"])
"street_address": parse_address_two(home_info["streetAddress"])[0],
"unit": parse_unit(home_info["unit"])
if "unit" in home_info
else None,
"city": home_info["city"],
@ -225,9 +225,10 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"]
)
address_data = property_data["address"]
street_address, unit = parse_address_two(address_data["streetAddress"])
address = Address(
street_address=address_data["streetAddress"],
unit=parse_address_two(address_data["streetAddress"]),
street_address=street_address,
unit=unit,
city=address_data["city"],
state=address_data["state"],
zip_code=address_data["zipcode"],
@ -286,10 +287,11 @@ class ZillowScraper(Scraper):
else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
street_address, unit = parse_address_two(street_address)
return Address(
street_address=street_address,
city=city,
unit=parse_address_two(street_address),
unit=unit,
state=state,
zip_code=zip_code,
country="USA",

View File

@ -1,7 +1,29 @@
import re
def parse_address_two(street_address: str):
# Trailing unit designator: either "#<id>", or one of the keywords APT / UNIT /
# LOT followed by an identifier.  The keyword must start on a word boundary and
# be separated from the identifier by whitespace unless the identifier begins
# with a digit ("APT32"), so ordinary words such as "Camelot 5" or "Lotus" are
# not mistaken for a unit.
_UNIT_SUFFIX_RE = re.compile(
    r"(?:\b(?:APT|UNIT|LOT)(?:\s+|(?=\d))|#)([\dA-Z]+)$",
    re.I,
)


def parse_address_two(street_address: str) -> tuple:
    """Split a trailing unit designator off a street address.

    Args:
        street_address: Full street line, e.g. ``"810 E Colter St APT 32"``.
            May be empty or ``None``.

    Returns:
        A ``(main_address, unit)`` tuple. ``unit`` is normalized to ``"#<id>"``
        (e.g. ``"#32"``) or is ``None`` when no unit suffix is found, in which
        case ``main_address`` is the input unchanged. A falsy input is returned
        as-is together with ``None``.
    """
    if not street_address:
        return street_address, None

    unit_match = _UNIT_SUFFIX_RE.search(street_address)
    if unit_match is None:
        return street_address, None

    # Slice at the match position instead of str.replace(): replace() would
    # also delete any earlier occurrence of the same text in the address.
    main_address = street_address[: unit_match.start()].strip()
    return main_address, f"#{unit_match.group(1)}"
def parse_unit(street_address: str):
if not street_address:
return None
apt_match = re.search(
@ -19,7 +41,7 @@ def parse_address_two(street_address: str):
if __name__ == "__main__":
print(parse_address_two("810 E Colter St APT 32"))
print(parse_address_two("4303 E Cactus Rd Apt 126"))
print(parse_address_two("1234 Elm Street apt 2B"))
print(parse_address_two("1234 Elm Street UNIT 3A"))
print(parse_address_two("1234 Elm Street unit 3A"))

View File

@ -9,7 +9,7 @@ def test_realtor():
listing_type="for_sale",
),
scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_redfin():
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
),
scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_zillow():
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
),
scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"