HomeHarvest/homeharvest/__init__.py

100 lines
3.0 KiB
Python

from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd
_scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper,
"zillow": ZillowScraper,
}
def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> Union[list[Building], list[Property]]:
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
site_name=SiteName[site_name.upper()],
)
site = _scrapers[site_name.lower()](scraper_input)
results = site.search()
properties_dfs = []
for result in results:
prop_data = result.__dict__
address_data = prop_data["address"]
prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value
prop_data["property_type"] = prop_data["property_type"].value.lower()
prop_data["address_one"] = address_data.address_one
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code
prop_data["address_two"] = address_data.address_two
del prop_data["address"]
if isinstance(result, Property):
desired_order = [
"listing_type",
"address_one",
"city",
"state",
"zip_code",
"address_two",
"url",
"property_type",
"price",
"beds",
"baths",
"square_feet",
"price_per_square_foot",
"lot_size",
"stories",
"year_built",
"agent_name",
"mls_id",
"description",
]
elif isinstance(result, Building):
desired_order = [
"address_one",
"city",
"state",
"zip_code",
"address_two",
"url",
"num_units",
"min_unit_price",
"max_unit_price",
"avg_unit_price",
"listing_type",
]
properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[desired_order]
properties_dfs.append(properties_df)
return pd.concat(properties_dfs, ignore_index=True)