Merge pull request #2 from ZacharyHampton/all_3_sites

feat: run all 3 sites with one call
pull/3/head
Zachary Hampton 2023-09-18 15:17:50 -07:00 committed by GitHub
commit d0a6a66b6a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 80 additions and 40 deletions

View File

@ -1,11 +1,14 @@
import pandas as pd
from typing import Union
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, SiteName from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd
_scrapers = { _scrapers = {
@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
return properties_df return properties_df
def scrape_property( def _scrape_single_site(
location: str, location: str, site_name: str, listing_type: str
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape property from various sites from a given location and listing type. Helper function to scrape a single site.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
""" """
validate_input(site_name, listing_type) validate_input(site_name, listing_type)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@ -122,3 +116,46 @@ def scrape_property(
return pd.DataFrame() return pd.DataFrame()
return pd.concat(properties_dfs, ignore_index=True) return pd.concat(properties_dfs, ignore_index=True)
def scrape_property(
    location: str,
    site_name: Union[str, list[str], None] = None,
    listing_type: str = "for_sale",
) -> pd.DataFrame:
    """
    Scrape property listings from one or more supported sites.

    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin').
        Defaults to all supported sites when None.
    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
    :return: pd.DataFrame containing properties, de-duplicated by street address
    """
    # Default is None rather than list(_scrapers.keys()): a list default is a
    # mutable object evaluated once at def time (mutable-default anti-pattern).
    if site_name is None:
        site_name = list(_scrapers.keys())
    if not isinstance(site_name, list):
        site_name = [site_name]

    if len(site_name) == 1:
        # Single site: no need to spin up a thread pool.
        final_df = _scrape_single_site(location, site_name[0], listing_type)
    else:
        # Fan out one thread per site; results are collected as they finish,
        # so row order across sites is not deterministic.
        results = []
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
                for s_name in site_name
            }
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())

        if not results:
            return pd.DataFrame()
        final_df = pd.concat(results, ignore_index=True)

    # The same listing can appear on multiple sites; keep the first occurrence
    # per street address.
    return final_df.drop_duplicates(subset="street_address", keep="first")

View File

@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip_code=result["location"]["address"]["postal_code"],
unit=result["location"]["address"]["unit"], unit=parse_address_two(result["location"]["address"]["unit"]),
country="USA", country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,

View File

@ -130,7 +130,9 @@ class ZillowScraper(Scraper):
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"street_address": home_info["streetAddress"], "street_address": home_info["streetAddress"],
"unit": home_info.get("unit"), "unit": parse_address_two(home_info["unit"])
if "unit" in home_info
else None,
"city": home_info["city"], "city": home_info["city"],
"state": home_info["state"], "state": home_info["state"],
"zip_code": home_info["zipcode"], "zip_code": home_info["zipcode"],
@ -213,22 +215,6 @@ class ZillowScraper(Scraper):
return properties_list return properties_list
def _extract_units(self, result: dict):
units = {}
if "units" in result:
num_units = result.get("availabilityCount", len(result["units"]))
prices = [
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in result["units"]
]
units["apt_availability_count"] = num_units
units["apt_min_unit_price"] = min(prices)
units["apt_max_unit_price"] = max(prices)
units["apt_avg_unit_price"] = (
sum(prices) // num_units if num_units else None
)
return units
def _get_single_property_page(self, property_data: dict): def _get_single_property_page(self, property_data: dict):
""" """
This method is used when a user enters the exact location & zillow returns just one property This method is used when a user enters the exact location & zillow returns just one property
@ -239,10 +225,9 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
unit = parse_address_two(address_data["streetAddress"])
address = Address( address = Address(
street_address=address_data["streetAddress"], street_address=address_data["streetAddress"],
unit=unit, unit=parse_address_two(address_data["streetAddress"]),
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
@ -301,11 +286,10 @@ class ZillowScraper(Scraper):
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
unit = parse_address_two(street_address)
return Address( return Address(
street_address=street_address, street_address=street_address,
city=city, city=city,
unit=unit, unit=parse_address_two(street_address),
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
country="USA", country="USA",

View File

@ -1,6 +1,25 @@
import re import re
def parse_address_two(street_address: str):
    """
    Extract and normalize the unit designator from a street address.

    Recognizes a trailing "APT …", "UNIT …", "LOT …", or "#…" suffix and
    normalizes the keyword to "#" (e.g. "810 E Colter St APT 32" -> "#32"),
    so every site reports units in the same form.

    :param street_address: Full street-address line; may be None or empty.
    :return: Normalized unit string like "#32", or None when no unit suffix.
    """
    if not street_address:
        return None

    apt_match = re.search(
        r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if not apt_match:
        return None

    apt_str = apt_match.group().strip()
    # Normalize the keyword prefix ("APT "/"UNIT "/"LOT ") to "#".
    return re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)


if __name__ == "__main__":
    print(parse_address_two("810 E Colter St APT 32"))
    print(parse_address_two("1234 Elm Street apt 2B"))
    print(parse_address_two("1234 Elm Street UNIT 3A"))
    print(parse_address_two("1234 Elm Street unit 3A"))