fix: filter dup on street, unit, city
parent
8e140a0e45
commit
ca260fd2b4
|
@ -140,7 +140,9 @@ def scrape_property(
|
||||||
|
|
||||||
if len(site_name) == 1:
|
if len(site_name) == 1:
|
||||||
final_df = _scrape_single_site(location, site_name[0], listing_type)
|
final_df = _scrape_single_site(location, site_name[0], listing_type)
|
||||||
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
|
final_df = final_df.drop_duplicates(
|
||||||
|
subset=["street_address", "city", "unit"], keep="first"
|
||||||
|
)
|
||||||
return final_df
|
return final_df
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
@ -157,5 +159,7 @@ def scrape_property(
|
||||||
if not results:
|
if not results:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
final_df = pd.concat(results, ignore_index=True)
|
final_df = pd.concat(results, ignore_index=True)
|
||||||
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
|
final_df = final_df.drop_duplicates(
|
||||||
|
subset=["street_address", "city", "unit"], keep="first"
|
||||||
|
)
|
||||||
return final_df
|
return final_df
|
||||||
|
|
|
@ -3,7 +3,7 @@ from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any, Generator
|
from typing import Any, Generator
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound
|
||||||
from ....utils import parse_address_two
|
from ....utils import parse_address_two, parse_unit
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
|
@ -108,8 +108,7 @@ class RealtorScraper(Scraper):
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
property_info = response_json["data"]["property"]
|
property_info = response_json["data"]["property"]
|
||||||
street_address = property_info["address"]["line"]
|
street_address, unit = parse_address_two(property_info["address"]["line"])
|
||||||
unit = parse_address_two(street_address)
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
Property(
|
Property(
|
||||||
|
@ -234,13 +233,16 @@ class RealtorScraper(Scraper):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for result in response_json["data"]["home_search"]["results"]:
|
for result in response_json["data"]["home_search"]["results"]:
|
||||||
|
street_address, unit = parse_address_two(
|
||||||
|
result["location"]["address"]["line"]
|
||||||
|
)
|
||||||
realty_property = Property(
|
realty_property = Property(
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=result["location"]["address"]["line"],
|
street_address=street_address,
|
||||||
city=result["location"]["address"]["city"],
|
city=result["location"]["address"]["city"],
|
||||||
state=result["location"]["address"]["state_code"],
|
state=result["location"]["address"]["state_code"],
|
||||||
zip_code=result["location"]["address"]["postal_code"],
|
zip_code=result["location"]["address"]["postal_code"],
|
||||||
unit=parse_address_two(result["location"]["address"]["unit"]),
|
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
country="USA",
|
||||||
),
|
),
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import json
|
import json
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two
|
from ....utils import parse_address_two, parse_unit
|
||||||
from ..models import Property, Address, PropertyType
|
from ..models import Property, Address, PropertyType
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,9 +39,10 @@ class RedfinScraper(Scraper):
|
||||||
return home[key]["value"]
|
return home[key]["value"]
|
||||||
|
|
||||||
if not single_search:
|
if not single_search:
|
||||||
unit = parse_address_two(get_value("streetLine"))
|
street_address, unit = parse_address_two(get_value("streetLine"))
|
||||||
|
unit = parse_unit(get_value("streetLine"))
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=get_value("streetLine"),
|
street_address=street_address,
|
||||||
city=home["city"],
|
city=home["city"],
|
||||||
state=home["state"],
|
state=home["state"],
|
||||||
zip_code=home["zip"],
|
zip_code=home["zip"],
|
||||||
|
@ -50,10 +51,11 @@ class RedfinScraper(Scraper):
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
address_info = home["streetAddress"]
|
address_info = home["streetAddress"]
|
||||||
|
street_address, unit = parse_address_two(address_info["assembledAddress"])
|
||||||
unit = parse_address_two(address_info["assembledAddress"])
|
unit = parse_address_two(address_info["assembledAddress"])
|
||||||
|
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=address_info["assembledAddress"],
|
street_address=street_address,
|
||||||
city=home["city"],
|
city=home["city"],
|
||||||
state=home["state"],
|
state=home["state"],
|
||||||
zip_code=home["zip"],
|
zip_code=home["zip"],
|
||||||
|
@ -94,26 +96,30 @@ class RedfinScraper(Scraper):
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_building(self, building: dict) -> Property:
|
def _parse_building(self, building: dict) -> Property:
|
||||||
|
street_address = " ".join(
|
||||||
|
[
|
||||||
|
building["address"]["streetNumber"],
|
||||||
|
building["address"]["directionalPrefix"],
|
||||||
|
building["address"]["streetName"],
|
||||||
|
building["address"]["streetType"],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
street_address, unit = parse_address_two(street_address)
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_type=PropertyType("BUILDING"),
|
property_type=PropertyType("BUILDING"),
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=" ".join(
|
street_address=street_address,
|
||||||
[
|
|
||||||
building["address"]["streetNumber"],
|
|
||||||
building["address"]["directionalPrefix"],
|
|
||||||
building["address"]["streetName"],
|
|
||||||
building["address"]["streetType"],
|
|
||||||
]
|
|
||||||
),
|
|
||||||
city=building["address"]["city"],
|
city=building["address"]["city"],
|
||||||
state=building["address"]["stateOrProvinceCode"],
|
state=building["address"]["stateOrProvinceCode"],
|
||||||
zip_code=building["address"]["postalCode"],
|
zip_code=building["address"]["postalCode"],
|
||||||
unit=" ".join(
|
unit=parse_unit(
|
||||||
[
|
" ".join(
|
||||||
building["address"]["unitType"],
|
[
|
||||||
building["address"]["unitValue"],
|
building["address"]["unitType"],
|
||||||
]
|
building["address"]["unitValue"],
|
||||||
|
]
|
||||||
|
)
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
property_url="https://www.redfin.com{}".format(building["url"]),
|
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two
|
from ....utils import parse_address_two, parse_unit
|
||||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
||||||
|
|
||||||
|
@ -129,8 +129,8 @@ class ZillowScraper(Scraper):
|
||||||
if "hdpData" in result:
|
if "hdpData" in result:
|
||||||
home_info = result["hdpData"]["homeInfo"]
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
address_data = {
|
address_data = {
|
||||||
"street_address": home_info["streetAddress"],
|
"street_address": parse_address_two(home_info["streetAddress"])[0],
|
||||||
"unit": parse_address_two(home_info["unit"])
|
"unit": parse_unit(home_info["unit"])
|
||||||
if "unit" in home_info
|
if "unit" in home_info
|
||||||
else None,
|
else None,
|
||||||
"city": home_info["city"],
|
"city": home_info["city"],
|
||||||
|
@ -225,9 +225,10 @@ class ZillowScraper(Scraper):
|
||||||
else property_data["hdpUrl"]
|
else property_data["hdpUrl"]
|
||||||
)
|
)
|
||||||
address_data = property_data["address"]
|
address_data = property_data["address"]
|
||||||
|
street_address, unit = parse_address_two(address_data["streetAddress"])
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=address_data["streetAddress"],
|
street_address=street_address,
|
||||||
unit=parse_address_two(address_data["streetAddress"]),
|
unit=unit,
|
||||||
city=address_data["city"],
|
city=address_data["city"],
|
||||||
state=address_data["state"],
|
state=address_data["state"],
|
||||||
zip_code=address_data["zipcode"],
|
zip_code=address_data["zipcode"],
|
||||||
|
@ -286,10 +287,11 @@ class ZillowScraper(Scraper):
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||||
|
|
||||||
|
street_address, unit = parse_address_two(street_address)
|
||||||
return Address(
|
return Address(
|
||||||
street_address=street_address,
|
street_address=street_address,
|
||||||
city=city,
|
city=city,
|
||||||
unit=parse_address_two(street_address),
|
unit=unit,
|
||||||
state=state,
|
state=state,
|
||||||
zip_code=zip_code,
|
zip_code=zip_code,
|
||||||
country="USA",
|
country="USA",
|
||||||
|
|
|
@ -1,7 +1,29 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def parse_address_two(street_address: str):
|
def parse_address_two(street_address: str) -> tuple:
|
||||||
|
if not street_address:
|
||||||
|
return street_address, None
|
||||||
|
|
||||||
|
apt_match = re.search(
|
||||||
|
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$",
|
||||||
|
street_address,
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
if apt_match:
|
||||||
|
apt_str = apt_match.group().strip()
|
||||||
|
cleaned_apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
|
||||||
|
|
||||||
|
main_address = street_address.replace(
|
||||||
|
apt_str, ""
|
||||||
|
).strip() # Remove the matched part from the original address
|
||||||
|
return main_address, cleaned_apt_str
|
||||||
|
else:
|
||||||
|
return street_address, None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_unit(street_address: str):
|
||||||
if not street_address:
|
if not street_address:
|
||||||
return None
|
return None
|
||||||
apt_match = re.search(
|
apt_match = re.search(
|
||||||
|
@ -19,7 +41,7 @@ def parse_address_two(street_address: str):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(parse_address_two("810 E Colter St APT 32"))
|
print(parse_address_two("4303 E Cactus Rd Apt 126"))
|
||||||
print(parse_address_two("1234 Elm Street apt 2B"))
|
print(parse_address_two("1234 Elm Street apt 2B"))
|
||||||
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
||||||
print(parse_address_two("1234 Elm Street unit 3A"))
|
print(parse_address_two("1234 Elm Street unit 3A"))
|
||||||
|
|
|
@ -9,7 +9,7 @@ def test_realtor():
|
||||||
listing_type="for_sale",
|
listing_type="for_sale",
|
||||||
),
|
),
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
|
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
|
||||||
), #: does not support "city, state, USA" format
|
), #: does not support "city, state, USA" format
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
||||||
|
|
|
@ -7,7 +7,7 @@ def test_redfin():
|
||||||
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
||||||
),
|
),
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
|
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
|
||||||
),
|
),
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
||||||
|
|
|
@ -7,7 +7,7 @@ def test_zillow():
|
||||||
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
||||||
),
|
),
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
|
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
|
||||||
),
|
),
|
||||||
scrape_property(
|
scrape_property(
|
||||||
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
||||||
|
|
Loading…
Reference in New Issue