Merge pull request #3 from ZacharyHampton/all_3_sites

Check dups with city, street_address, unit
pull/5/head
Zachary Hampton 2023-09-18 16:00:27 -07:00 committed by GitHub
commit d5b4d80f96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 68 additions and 35 deletions

View File

@ -158,5 +158,5 @@ def scrape_property(
return pd.DataFrame() return pd.DataFrame()
final_df = pd.concat(results, ignore_index=True) final_df = pd.concat(results, ignore_index=True)
final_df = final_df.drop_duplicates(subset="street_address", keep="first") final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first")
return final_df return final_df

View File

@ -3,7 +3,7 @@ from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -108,8 +108,7 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"] street_address, unit = parse_address_two(property_info["address"]["line"])
unit = parse_address_two(street_address)
return [ return [
Property( Property(
@ -234,13 +233,16 @@ class RealtorScraper(Scraper):
return [] return []
for result in response_json["data"]["home_search"]["results"]: for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
realty_property = Property( realty_property = Property(
address=Address( address=Address(
street_address=result["location"]["address"]["line"], street_address=street_address,
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip_code=result["location"]["address"]["postal_code"],
unit=parse_address_two(result["location"]["address"]["unit"]), unit=parse_unit(result["location"]["address"]["unit"]),
country="USA", country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,

View File

@ -1,7 +1,7 @@
import json import json
from typing import Any from typing import Any
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from ..models import Property, Address, PropertyType from ..models import Property, Address, PropertyType
@ -39,9 +39,10 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
unit = parse_address_two(get_value("streetLine")) street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address( address = Address(
street_address=get_value("streetLine"), street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
@ -50,10 +51,11 @@ class RedfinScraper(Scraper):
) )
else: else:
address_info = home["streetAddress"] address_info = home["streetAddress"]
street_address, unit = parse_address_two(address_info["assembledAddress"])
unit = parse_address_two(address_info["assembledAddress"]) unit = parse_address_two(address_info["assembledAddress"])
address = Address( address = Address(
street_address=address_info["assembledAddress"], street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
@ -94,26 +96,30 @@ class RedfinScraper(Scraper):
) )
def _parse_building(self, building: dict) -> Property: def _parse_building(self, building: dict) -> Property:
street_address = " ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
)
street_address, unit = parse_address_two(street_address)
return Property( return Property(
site_name=self.site_name, site_name=self.site_name,
property_type=PropertyType("BUILDING"), property_type=PropertyType("BUILDING"),
address=Address( address=Address(
street_address=" ".join( street_address=street_address,
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
),
city=building["address"]["city"], city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"], state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"], zip_code=building["address"]["postalCode"],
unit=" ".join( unit=parse_unit(
[ " ".join(
building["address"]["unitType"], [
building["address"]["unitValue"], building["address"]["unitType"],
] building["address"]["unitValue"],
]
)
), ),
), ),
property_url="https://www.redfin.com{}".format(building["url"]), property_url="https://www.redfin.com{}".format(building["url"]),

View File

@ -1,7 +1,7 @@
import re import re
import json import json
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two from ....utils import parse_address_two, parse_unit
from ....exceptions import NoResultsFound, PropertyNotFound from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName from ..models import Property, Address, ListingType, PropertyType, SiteName
@ -129,8 +129,8 @@ class ZillowScraper(Scraper):
if "hdpData" in result: if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"street_address": home_info["streetAddress"], "street_address": parse_address_two(home_info["streetAddress"])[0],
"unit": parse_address_two(home_info["unit"]) "unit": parse_unit(home_info["unit"])
if "unit" in home_info if "unit" in home_info
else None, else None,
"city": home_info["city"], "city": home_info["city"],
@ -225,9 +225,10 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
street_address, unit = parse_address_two(address_data["streetAddress"])
address = Address( address = Address(
street_address=address_data["streetAddress"], street_address=street_address,
unit=parse_address_two(address_data["streetAddress"]), unit=unit,
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
@ -286,10 +287,11 @@ class ZillowScraper(Scraper):
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
street_address, unit = parse_address_two(street_address)
return Address( return Address(
street_address=street_address, street_address=street_address,
city=city, city=city,
unit=parse_address_two(street_address), unit=unit,
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
country="USA", country="USA",

View File

@ -1,7 +1,29 @@
import re import re
def parse_address_two(street_address: str) -> tuple:
    """Split a trailing unit designator off a street address line.

    A trailing ``APT``, ``UNIT``, ``LOT`` or ``SUITE`` keyword (matched
    case-insensitively) followed by an alphanumeric identifier, or a trailing
    ``#`` followed by an identifier, is treated as the unit. The unit is
    normalised to ``"#<identifier>"`` with the identifier's case preserved.

    Note that any alphanumeric word after a trailing keyword counts as an
    identifier, so an address such as ``"12 Unit St"`` is still split.

    :param street_address: the raw address line; may be empty or ``None``.
    :return: ``(main_address, unit)``. ``unit`` is ``None`` when no trailing
        designator is found, in which case ``main_address`` is the input
        returned unchanged (including an empty or ``None`` input).
    """
    if not street_address:
        return street_address, None

    # ``\b`` keeps the keywords from matching inside a longer word (e.g. the
    # "lotte" in "Charlotte"); ``#`` needs no boundary because it is not a
    # word character. The identifier is captured so it can be reused directly.
    unit_match = re.search(
        r"(?:\b(?:APT|UNIT|LOT|SUITE)\s*|#)([\dA-Z]+)$",
        street_address,
        re.I,
    )
    if unit_match is None:
        return street_address, None

    # Slice at the match start so only the trailing designator is removed,
    # even if the same text also appears earlier in the address.
    main_address = street_address[: unit_match.start()].strip()
    return main_address, "#" + unit_match.group(1)
def parse_unit(street_address: str):
if not street_address: if not street_address:
return None return None
apt_match = re.search( apt_match = re.search(
@ -19,7 +41,8 @@ def parse_address_two(street_address: str):
if __name__ == "__main__": if __name__ == "__main__":
print(parse_address_two("810 E Colter St APT 32")) print(parse_address_two("4303 E Cactus Rd Apt 126"))
print(parse_address_two("1234 Elm Street apt 2B")) print(parse_address_two("1234 Elm Street apt 2B"))
print(parse_address_two("1234 Elm Street UNIT 3A")) print(parse_address_two("1234 Elm Street UNIT 3A"))
print(parse_address_two("1234 Elm Street unit 3A")) print(parse_address_two("1234 Elm Street unit 3A"))
print(parse_address_two("1234 Elm Street SuIte 3A"))

View File

@ -9,7 +9,7 @@ def test_realtor():
listing_type="for_sale", listing_type="for_sale",
), ),
scrape_property( scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent" location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
), #: does not support "city, state, USA" format ), #: does not support "city, state, USA" format
scrape_property( scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold" location="Dallas, TX", site_name="realtor.com", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_redfin():
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
), ),
scrape_property( scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent" location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
), ),
scrape_property( scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold" location="Dallas, TX, USA", site_name="redfin", listing_type="sold"

View File

@ -7,7 +7,7 @@ def test_zillow():
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
), ),
scrape_property( scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent" location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
), ),
scrape_property( scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold" location="Dallas, TX, USA", site_name="zillow", listing_type="sold"