Merge pull request #2 from ZacharyHampton/all_3_sites
feat: run all 3 sites with one call (pull/3/head)
commit
d0a6a66b6a
|
@ -1,11 +1,14 @@
|
|||
import pandas as pd
|
||||
from typing import Union
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from .core.scrapers import ScraperInput
|
||||
from .core.scrapers.redfin import RedfinScraper
|
||||
from .core.scrapers.realtor import RealtorScraper
|
||||
from .core.scrapers.zillow import ZillowScraper
|
||||
from .core.scrapers.models import ListingType, Property, SiteName
|
||||
from .core.scrapers import ScraperInput
|
||||
from .exceptions import InvalidSite, InvalidListingType
|
||||
from typing import Union
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_scrapers = {
|
||||
|
@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||
return properties_df
|
||||
|
||||
|
||||
def scrape_property(
|
||||
location: str,
|
||||
site_name: str,
|
||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||
def _scrape_single_site(
|
||||
location: str, site_name: str, listing_type: str
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Scrape property from various sites from a given location and listing type.
|
||||
|
||||
:returns: pd.DataFrame
|
||||
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
|
||||
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
|
||||
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
||||
:return: pd.DataFrame containing properties
|
||||
Helper function to scrape a single site.
|
||||
"""
|
||||
|
||||
validate_input(site_name, listing_type)
|
||||
|
||||
scraper_input = ScraperInput(
|
||||
|
@ -122,3 +116,46 @@ def scrape_property(
|
|||
return pd.DataFrame()
|
||||
|
||||
return pd.concat(properties_dfs, ignore_index=True)
|
||||
|
||||
|
||||
def scrape_property(
    location: str,
    site_name: Union[str, list[str]] = None,
    listing_type: str = "for_sale",
) -> pd.DataFrame:
    """
    Scrape property from various sites from a given location and listing type.

    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin');
        defaults to all supported sites when None
    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
    :return: pd.DataFrame containing properties, de-duplicated on street_address
    """
    # NOTE: default is None rather than list(_scrapers.keys()) so the list of
    # supported sites is resolved at call time, not frozen at import time
    # (mutable/call-evaluated defaults are a classic Python pitfall).
    if site_name is None:
        site_name = list(_scrapers.keys())

    if not isinstance(site_name, list):
        site_name = [site_name]

    if len(site_name) == 1:
        # Single site: no need to spin up a thread pool.
        final_df = _scrape_single_site(location, site_name[0], listing_type)
        return final_df.drop_duplicates(subset="street_address", keep="first")

    # Fan out one scrape per site and gather results as they complete.
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
            for s_name in site_name
        }
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())

    if not results:
        return pd.DataFrame()

    final_df = pd.concat(results, ignore_index=True)
    # Properties found on multiple sites collapse to the first occurrence.
    return final_df.drop_duplicates(subset="street_address", keep="first")
|
||||
|
|
|
@ -240,7 +240,7 @@ class RealtorScraper(Scraper):
|
|||
city=result["location"]["address"]["city"],
|
||||
state=result["location"]["address"]["state_code"],
|
||||
zip_code=result["location"]["address"]["postal_code"],
|
||||
unit=result["location"]["address"]["unit"],
|
||||
unit=parse_address_two(result["location"]["address"]["unit"]),
|
||||
country="USA",
|
||||
),
|
||||
site_name=self.site_name,
|
||||
|
|
|
@ -130,7 +130,9 @@ class ZillowScraper(Scraper):
|
|||
home_info = result["hdpData"]["homeInfo"]
|
||||
address_data = {
|
||||
"street_address": home_info["streetAddress"],
|
||||
"unit": home_info.get("unit"),
|
||||
"unit": parse_address_two(home_info["unit"])
|
||||
if "unit" in home_info
|
||||
else None,
|
||||
"city": home_info["city"],
|
||||
"state": home_info["state"],
|
||||
"zip_code": home_info["zipcode"],
|
||||
|
@ -213,22 +215,6 @@ class ZillowScraper(Scraper):
|
|||
|
||||
return properties_list
|
||||
|
||||
def _extract_units(self, result: dict):
|
||||
units = {}
|
||||
if "units" in result:
|
||||
num_units = result.get("availabilityCount", len(result["units"]))
|
||||
prices = [
|
||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
||||
for unit in result["units"]
|
||||
]
|
||||
units["apt_availability_count"] = num_units
|
||||
units["apt_min_unit_price"] = min(prices)
|
||||
units["apt_max_unit_price"] = max(prices)
|
||||
units["apt_avg_unit_price"] = (
|
||||
sum(prices) // num_units if num_units else None
|
||||
)
|
||||
return units
|
||||
|
||||
def _get_single_property_page(self, property_data: dict):
|
||||
"""
|
||||
This method is used when a user enters the exact location & zillow returns just one property
|
||||
|
@ -239,10 +225,9 @@ class ZillowScraper(Scraper):
|
|||
else property_data["hdpUrl"]
|
||||
)
|
||||
address_data = property_data["address"]
|
||||
unit = parse_address_two(address_data["streetAddress"])
|
||||
address = Address(
|
||||
street_address=address_data["streetAddress"],
|
||||
unit=unit,
|
||||
unit=parse_address_two(address_data["streetAddress"]),
|
||||
city=address_data["city"],
|
||||
state=address_data["state"],
|
||||
zip_code=address_data["zipcode"],
|
||||
|
@ -301,11 +286,10 @@ class ZillowScraper(Scraper):
|
|||
else:
|
||||
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||
|
||||
unit = parse_address_two(street_address)
|
||||
return Address(
|
||||
street_address=street_address,
|
||||
city=city,
|
||||
unit=unit,
|
||||
unit=parse_address_two(street_address),
|
||||
state=state,
|
||||
zip_code=zip_code,
|
||||
country="USA",
|
||||
|
|
|
@ -1,6 +1,25 @@
|
|||
import re
|
||||
|
||||
|
||||
def parse_address_two(address_one: str):
|
||||
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
|
||||
return apt_match.group().strip() if apt_match else None
|
||||
def parse_address_two(street_address: str):
    """
    Extract a trailing unit designator (APT/UNIT/LOT/#) from a street address.

    :param street_address: full street address, possibly ending in a unit part
    :return: the designator normalized to "#<id>" form (e.g. "APT 32" -> "#32"),
        or None when the address is empty or has no trailing unit part
    """
    if not street_address:
        return None

    match = re.search(
        r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if match is None:
        return None

    # Normalize "APT "/"UNIT "/"LOT " prefixes to a single "#".
    designator = match.group().strip()
    return re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", designator, flags=re.I)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick manual smoke test for the unit-designator parser.
    samples = [
        "810 E Colter St APT 32",
        "1234 Elm Street apt 2B",
        "1234 Elm Street UNIT 3A",
        "1234 Elm Street unit 3A",
    ]
    for sample in samples:
        print(parse_address_two(sample))
||||
|
|
Loading…
Reference in New Issue