fix: filter dup on street, unit, city

pull/3/head
Cullen Watson 2023-09-18 17:42:16 -05:00
parent 8e140a0e45
commit ca260fd2b4
8 changed files with 71 additions and 35 deletions

View File

@ -140,7 +140,9 @@ def scrape_property(
if len(site_name) == 1: if len(site_name) == 1:
final_df = _scrape_single_site(location, site_name[0], listing_type) final_df = _scrape_single_site(location, site_name[0], listing_type)
final_df = final_df.drop_duplicates(subset="street_address", keep="first") final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
)
return final_df return final_df
results = [] results = []
@ -157,5 +159,7 @@ def scrape_property(
if not results: if not results:
return pd.DataFrame() return pd.DataFrame()
final_df = pd.concat(results, ignore_index=True) final_df = pd.concat(results, ignore_index=True)
final_df = final_df.drop_duplicates(subset="street_address", keep="first") final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
)
return final_df return final_df

View File

@ -3,7 +3,7 @@ from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -108,8 +108,7 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"] street_address, unit = parse_address_two(property_info["address"]["line"])
unit = parse_address_two(street_address)
return [ return [
Property( Property(
@ -234,13 +233,16 @@ class RealtorScraper(Scraper):
return [] return []
for result in response_json["data"]["home_search"]["results"]: for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
realty_property = Property( realty_property = Property(
address=Address( address=Address(
street_address=result["location"]["address"]["line"], street_address=street_address,
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip_code=result["location"]["address"]["postal_code"],
unit=parse_address_two(result["location"]["address"]["unit"]), unit=parse_unit(result["location"]["address"]["unit"]),
country="USA", country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,

View File

@ -1,7 +1,7 @@
import json import json
from typing import Any from typing import Any
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from ..models import Property, Address, PropertyType from ..models import Property, Address, PropertyType
@ -39,9 +39,10 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
unit = parse_address_two(get_value("streetLine")) street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address( address = Address(
street_address=get_value("streetLine"), street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
@ -50,10 +51,11 @@ class RedfinScraper(Scraper):
) )
else: else:
address_info = home["streetAddress"] address_info = home["streetAddress"]
street_address, unit = parse_address_two(address_info["assembledAddress"])
unit = parse_address_two(address_info["assembledAddress"]) unit = parse_address_two(address_info["assembledAddress"])
address = Address( address = Address(
street_address=address_info["assembledAddress"], street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
@ -94,26 +96,30 @@ class RedfinScraper(Scraper):
) )
def _parse_building(self, building: dict) -> Property: def _parse_building(self, building: dict) -> Property:
return Property( street_address = " ".join(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
street_address=" ".join(
[ [
building["address"]["streetNumber"], building["address"]["streetNumber"],
building["address"]["directionalPrefix"], building["address"]["directionalPrefix"],
building["address"]["streetName"], building["address"]["streetName"],
building["address"]["streetType"], building["address"]["streetType"],
] ]
), )
street_address, unit = parse_address_two(street_address)
return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
street_address=street_address,
city=building["address"]["city"], city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"], state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"], zip_code=building["address"]["postalCode"],
unit=" ".join( unit=parse_unit(
" ".join(
[ [
building["address"]["unitType"], building["address"]["unitType"],
building["address"]["unitValue"], building["address"]["unitValue"],
] ]
)
), ),
), ),
property_url="https://www.redfin.com{}".format(building["url"]), property_url="https://www.redfin.com{}".format(building["url"]),

View File

@ -1,7 +1,7 @@
import re import re
import json import json
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from ....exceptions import NoResultsFound, PropertyNotFound from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName from ..models import Property, Address, ListingType, PropertyType, SiteName
@ -129,8 +129,8 @@ class ZillowScraper(Scraper):
if "hdpData" in result: if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"street_address": home_info["streetAddress"], "street_address": parse_address_two(home_info["streetAddress"])[0],
"unit": parse_address_two(home_info["unit"]) "unit": parse_unit(home_info["unit"])
if "unit" in home_info if "unit" in home_info
else None, else None,
"city": home_info["city"], "city": home_info["city"],
@ -225,9 +225,10 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
street_address, unit = parse_address_two(address_data["streetAddress"])
address = Address( address = Address(
street_address=address_data["streetAddress"], street_address=street_address,
unit=parse_address_two(address_data["streetAddress"]), unit=unit,
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
@ -286,10 +287,11 @@ class ZillowScraper(Scraper):
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
street_address, unit = parse_address_two(street_address)
return Address( return Address(
street_address=street_address, street_address=street_address,
city=city, city=city,
unit=parse_address_two(street_address), unit=unit,
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
country="USA", country="USA",

View File

@ -1,7 +1,29 @@
import re import re
# Trailing unit designator: "APT 12", "Unit 3A", "LOT 7" or "#2B" at the very
# end of the address. The keyword alternatives are anchored with \b so that
# words that merely contain them ("Pilot", "Camelot") are not mistaken for a
# unit. The unit value itself is captured in group 1.
_UNIT_SUFFIX_RE = re.compile(r"(?:\b(?:APT|UNIT|LOT)\s*|#)([\dA-Z]+)$", re.I)


def parse_address_two(street_address: str) -> tuple:
    """Split a trailing unit designator off a street address.

    A designator is ``APT``, ``UNIT`` or ``LOT`` (case-insensitive, optionally
    followed by whitespace) or ``#``, followed by letters/digits, located at
    the end of the string.

    :param street_address: raw address line; may be empty or ``None``.
    :return: ``(main_address, unit)``. ``unit`` is normalised to ``"#<value>"``
        or is ``None`` when no designator is found. When a designator is found
        ``main_address`` is the text before it with surrounding whitespace
        stripped; otherwise the input is returned unchanged (falsy input is
        passed through as-is).
    """
    if not street_address:
        return street_address, None

    unit_match = _UNIT_SUFFIX_RE.search(street_address)
    if unit_match is None:
        return street_address, None

    # Slice at the match position instead of str.replace(): replace() would
    # also delete any earlier occurrence of the same text in the address.
    main_address = street_address[: unit_match.start()].strip()
    return main_address, f"#{unit_match.group(1)}"
def parse_unit(street_address: str):
if not street_address: if not street_address:
return None return None
apt_match = re.search( apt_match = re.search(
@ -19,7 +41,7 @@ def parse_address_two(street_address: str):
if __name__ == "__main__": if __name__ == "__main__":
print(parse_address_two("810 E Colter St APT 32")) print(parse_address_two("4303 E Cactus Rd Apt 126"))
print(parse_address_two("1234 Elm Street apt 2B")) print(parse_address_two("1234 Elm Street apt 2B"))
print(parse_address_two("1234 Elm Street UNIT 3A")) print(parse_address_two("1234 Elm Street UNIT 3A"))
print(parse_address_two("1234 Elm Street unit 3A")) print(parse_address_two("1234 Elm Street unit 3A"))

View File

@ -9,7 +9,7 @@ def test_realtor():
listing_type="for_sale", listing_type="for_sale",
), ),
scrape_property( scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent" location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
), #: does not support "city, state, USA" format ), #: does not support "city, state, USA" format
scrape_property( scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold" location="Dallas, TX", site_name="realtor.com", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_redfin():
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
), ),
scrape_property( scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent" location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
), ),
scrape_property( scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold" location="Dallas, TX, USA", site_name="redfin", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_zillow():
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
), ),
scrape_property( scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent" location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
), ),
scrape_property( scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold" location="Dallas, TX, USA", site_name="zillow", listing_type="sold"