fix: filter dup on street, unit, city
parent
8e140a0e45
commit
ca260fd2b4
|
@ -140,7 +140,9 @@ def scrape_property(
|
|||
|
||||
if len(site_name) == 1:
|
||||
final_df = _scrape_single_site(location, site_name[0], listing_type)
|
||||
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
|
||||
final_df = final_df.drop_duplicates(
|
||||
subset=["street_address", "city", "unit"], keep="first"
|
||||
)
|
||||
return final_df
|
||||
|
||||
results = []
|
||||
|
@ -157,5 +159,7 @@ def scrape_property(
|
|||
if not results:
|
||||
return pd.DataFrame()
|
||||
final_df = pd.concat(results, ignore_index=True)
|
||||
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
|
||||
final_df = final_df.drop_duplicates(
|
||||
subset=["street_address", "city", "unit"], keep="first"
|
||||
)
|
||||
return final_df
|
||||
|
|
|
@ -3,7 +3,7 @@ from ..models import Property, Address
|
|||
from .. import Scraper
|
||||
from typing import Any, Generator
|
||||
from ....exceptions import NoResultsFound
|
||||
from ....utils import parse_address_two
|
||||
from ....utils import parse_address_two, parse_unit
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
|
||||
|
@ -108,8 +108,7 @@ class RealtorScraper(Scraper):
|
|||
response_json = response.json()
|
||||
|
||||
property_info = response_json["data"]["property"]
|
||||
street_address = property_info["address"]["line"]
|
||||
unit = parse_address_two(street_address)
|
||||
street_address, unit = parse_address_two(property_info["address"]["line"])
|
||||
|
||||
return [
|
||||
Property(
|
||||
|
@ -234,13 +233,16 @@ class RealtorScraper(Scraper):
|
|||
return []
|
||||
|
||||
for result in response_json["data"]["home_search"]["results"]:
|
||||
street_address, unit = parse_address_two(
|
||||
result["location"]["address"]["line"]
|
||||
)
|
||||
realty_property = Property(
|
||||
address=Address(
|
||||
street_address=result["location"]["address"]["line"],
|
||||
street_address=street_address,
|
||||
city=result["location"]["address"]["city"],
|
||||
state=result["location"]["address"]["state_code"],
|
||||
zip_code=result["location"]["address"]["postal_code"],
|
||||
unit=parse_address_two(result["location"]["address"]["unit"]),
|
||||
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||
country="USA",
|
||||
),
|
||||
site_name=self.site_name,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import json
|
||||
from typing import Any
|
||||
from .. import Scraper
|
||||
from ....utils import parse_address_two
|
||||
from ....utils import parse_address_two, parse_unit
|
||||
from ..models import Property, Address, PropertyType
|
||||
|
||||
|
||||
|
@ -39,9 +39,10 @@ class RedfinScraper(Scraper):
|
|||
return home[key]["value"]
|
||||
|
||||
if not single_search:
|
||||
unit = parse_address_two(get_value("streetLine"))
|
||||
street_address, unit = parse_address_two(get_value("streetLine"))
|
||||
unit = parse_unit(get_value("streetLine"))
|
||||
address = Address(
|
||||
street_address=get_value("streetLine"),
|
||||
street_address=street_address,
|
||||
city=home["city"],
|
||||
state=home["state"],
|
||||
zip_code=home["zip"],
|
||||
|
@ -50,10 +51,11 @@ class RedfinScraper(Scraper):
|
|||
)
|
||||
else:
|
||||
address_info = home["streetAddress"]
|
||||
street_address, unit = parse_address_two(address_info["assembledAddress"])
|
||||
unit = parse_address_two(address_info["assembledAddress"])
|
||||
|
||||
address = Address(
|
||||
street_address=address_info["assembledAddress"],
|
||||
street_address=street_address,
|
||||
city=home["city"],
|
||||
state=home["state"],
|
||||
zip_code=home["zip"],
|
||||
|
@ -94,26 +96,30 @@ class RedfinScraper(Scraper):
|
|||
)
|
||||
|
||||
def _parse_building(self, building: dict) -> Property:
|
||||
street_address = " ".join(
|
||||
[
|
||||
building["address"]["streetNumber"],
|
||||
building["address"]["directionalPrefix"],
|
||||
building["address"]["streetName"],
|
||||
building["address"]["streetType"],
|
||||
]
|
||||
)
|
||||
street_address, unit = parse_address_two(street_address)
|
||||
return Property(
|
||||
site_name=self.site_name,
|
||||
property_type=PropertyType("BUILDING"),
|
||||
address=Address(
|
||||
street_address=" ".join(
|
||||
[
|
||||
building["address"]["streetNumber"],
|
||||
building["address"]["directionalPrefix"],
|
||||
building["address"]["streetName"],
|
||||
building["address"]["streetType"],
|
||||
]
|
||||
),
|
||||
street_address=street_address,
|
||||
city=building["address"]["city"],
|
||||
state=building["address"]["stateOrProvinceCode"],
|
||||
zip_code=building["address"]["postalCode"],
|
||||
unit=" ".join(
|
||||
[
|
||||
building["address"]["unitType"],
|
||||
building["address"]["unitValue"],
|
||||
]
|
||||
unit=parse_unit(
|
||||
" ".join(
|
||||
[
|
||||
building["address"]["unitType"],
|
||||
building["address"]["unitValue"],
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
import json
|
||||
from .. import Scraper
|
||||
from ....utils import parse_address_two
|
||||
from ....utils import parse_address_two, parse_unit
|
||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
||||
|
||||
|
@ -129,8 +129,8 @@ class ZillowScraper(Scraper):
|
|||
if "hdpData" in result:
|
||||
home_info = result["hdpData"]["homeInfo"]
|
||||
address_data = {
|
||||
"street_address": home_info["streetAddress"],
|
||||
"unit": parse_address_two(home_info["unit"])
|
||||
"street_address": parse_address_two(home_info["streetAddress"])[0],
|
||||
"unit": parse_unit(home_info["unit"])
|
||||
if "unit" in home_info
|
||||
else None,
|
||||
"city": home_info["city"],
|
||||
|
@ -225,9 +225,10 @@ class ZillowScraper(Scraper):
|
|||
else property_data["hdpUrl"]
|
||||
)
|
||||
address_data = property_data["address"]
|
||||
street_address, unit = parse_address_two(address_data["streetAddress"])
|
||||
address = Address(
|
||||
street_address=address_data["streetAddress"],
|
||||
unit=parse_address_two(address_data["streetAddress"]),
|
||||
street_address=street_address,
|
||||
unit=unit,
|
||||
city=address_data["city"],
|
||||
state=address_data["state"],
|
||||
zip_code=address_data["zipcode"],
|
||||
|
@ -286,10 +287,11 @@ class ZillowScraper(Scraper):
|
|||
else:
|
||||
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||
|
||||
street_address, unit = parse_address_two(street_address)
|
||||
return Address(
|
||||
street_address=street_address,
|
||||
city=city,
|
||||
unit=parse_address_two(street_address),
|
||||
unit=unit,
|
||||
state=state,
|
||||
zip_code=zip_code,
|
||||
country="USA",
|
||||
|
|
|
@ -1,7 +1,29 @@
|
|||
import re
|
||||
|
||||
|
||||
def parse_address_two(street_address: str):
|
||||
def parse_address_two(street_address: str) -> tuple:
    """Split a trailing unit designator off a street address.

    A unit designator is one of ``APT``, ``UNIT`` or ``LOT`` (case-insensitive)
    or a ``#`` sign, followed by an alphanumeric value, located at the very end
    of the string.  The keyword must start a word and must be separated from
    its value by whitespace unless the value starts with a digit, so street
    names such as "Charlotte", "Chapter", "Unity" or "Lotus" are not mistaken
    for units.

    Args:
        street_address: Raw street line, e.g. ``"810 E Colter St APT 32"``.

    Returns:
        A ``(main_address, unit)`` tuple.  ``unit`` is normalized to the
        ``"#<value>"`` form (``"#32"``) or ``None`` when no unit is present.
        A falsy ``street_address`` is returned unchanged with ``None``.
    """
    if not street_address:
        return street_address, None

    apt_match = re.search(
        r"(?:\b(?:APT|UNIT|LOT)(?:\s+[\dA-Z]+|\d[\dA-Z]*)|#[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if not apt_match:
        return street_address, None

    apt_str = apt_match.group().strip()
    # Anchor the substitution so only the leading keyword is rewritten, never
    # letters inside the unit value itself.
    cleaned_apt_str = re.sub(r"^(?:APT|UNIT|LOT)\s*", "#", apt_str, flags=re.I)

    # Cut at the match position instead of str.replace(), which would also
    # delete identical text occurring earlier in the address.
    main_address = street_address[: apt_match.start()].strip()
    return main_address, cleaned_apt_str
|
||||
|
||||
|
||||
def parse_unit(street_address: str):
|
||||
if not street_address:
|
||||
return None
|
||||
apt_match = re.search(
|
||||
|
@ -19,7 +41,7 @@ def parse_address_two(street_address: str):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(parse_address_two("810 E Colter St APT 32"))
|
||||
print(parse_address_two("4303 E Cactus Rd Apt 126"))
|
||||
print(parse_address_two("1234 Elm Street apt 2B"))
|
||||
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
||||
print(parse_address_two("1234 Elm Street unit 3A"))
|
||||
|
|
|
@ -9,7 +9,7 @@ def test_realtor():
|
|||
listing_type="for_sale",
|
||||
),
|
||||
scrape_property(
|
||||
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
|
||||
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
|
||||
), #: does not support "city, state, USA" format
|
||||
scrape_property(
|
||||
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
||||
|
|
|
@ -7,7 +7,7 @@ def test_redfin():
|
|||
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
||||
),
|
||||
scrape_property(
|
||||
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
|
||||
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
|
||||
),
|
||||
scrape_property(
|
||||
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
||||
|
|
|
@ -7,7 +7,7 @@ def test_zillow():
|
|||
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
||||
),
|
||||
scrape_property(
|
||||
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
|
||||
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
|
||||
),
|
||||
scrape_property(
|
||||
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
||||
|
|
Loading…
Reference in New Issue