diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 9c6d9c8..f9169a6 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -23,9 +23,7 @@ def _validate_input(site_name: str, listing_type: str) -> None: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") if listing_type.upper() not in ListingType.__members__: - raise InvalidListingType( - f"Provided listing type, '{listing_type}', does not exist." - ) + raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.") def _get_ordered_properties(result: Property) -> list[str]: @@ -35,34 +33,26 @@ def _get_ordered_properties(result: Property) -> list[str]: "listing_type", "property_type", "status_text", - "currency", - "price", - "apt_min_price", - "apt_max_price", - "apt_min_sqft", - "apt_max_sqft", - "apt_min_beds", - "apt_max_beds", - "apt_min_baths", - "apt_max_baths", + "baths_min", + "baths_max", + "beds_min", + "beds_max", + "sqft_min", + "sqft_max", + "price_min", + "price_max", + "unit_count", "tax_assessed_value", - "square_feet", "price_per_sqft", - "beds", - "baths", "lot_area_value", "lot_area_unit", - "street_address", - "unit", + "address_one", + "address_two", "city", "state", "zip_code", - "country", "posted_time", - "bldg_min_beds", - "bldg_min_baths", - "bldg_min_area", - "bldg_unit_count", + "area_min", "bldg_name", "stories", "year_built", @@ -86,12 +76,11 @@ def _process_result(result: Property) -> pd.DataFrame: prop_data["property_type"] = None if "address" in prop_data: address_data = prop_data["address"] - prop_data["street_address"] = address_data.street_address - prop_data["unit"] = address_data.unit + prop_data["address_one"] = address_data.address_one + prop_data["address_two"] = address_data.address_two prop_data["city"] = address_data.city prop_data["state"] = address_data.state prop_data["zip_code"] = address_data.zip_code - prop_data["country"] = address_data.country del prop_data["address"] @@ -101,9 +90,7 @@ def _process_result(result: Property) -> pd.DataFrame: return properties_df -def _scrape_single_site( - location: str, site_name: str, listing_type: str, proxy: str = None -) -> pd.DataFrame: +def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame: """ Helper function to scrape a single site. 
""" @@ -120,9 +107,7 @@ def _scrape_single_site( results = site.search() properties_dfs = [_process_result(result) for result in results] - properties_dfs = [ - df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty - ] + properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty] if not properties_dfs: return pd.DataFrame() @@ -158,9 +143,7 @@ def scrape_property( else: with ThreadPoolExecutor() as executor: futures = { - executor.submit( - _scrape_single_site, location, s_name, listing_type, proxy - ): s_name + executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name for s_name in site_name } @@ -175,14 +158,12 @@ def scrape_property( final_df = pd.concat(results, ignore_index=True) - columns_to_track = ["street_address", "city", "unit"] + columns_to_track = ["address_one", "address_two", "city"] #: validate they exist, otherwise create them for col in columns_to_track: if col not in final_df.columns: final_df[col] = None - final_df = final_df.drop_duplicates( - subset=["street_address", "city", "unit"], keep="first" - ) + final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first") return final_df diff --git a/homeharvest/cli.py b/homeharvest/cli.py index df237cf..099873c 100644 --- a/homeharvest/cli.py +++ b/homeharvest/cli.py @@ -5,9 +5,7 @@ from homeharvest import scrape_property def main(): parser = argparse.ArgumentParser(description="Home Harvest Property Scraper") - parser.add_argument( - "location", type=str, help="Location to scrape (e.g., San Francisco, CA)" - ) + parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)") parser.add_argument( "-s", @@ -44,15 +42,11 @@ def main(): help="Name of the output file (without extension)", ) - parser.add_argument( - "-p", "--proxy", type=str, default=None, help="Proxy to use for scraping" - ) + parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping") args = parser.parse_args() - result = scrape_property( - args.location, args.site_name, args.listing_type, proxy=args.proxy - ) + result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy) if not args.filename: timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 2f14381..95ed3e1 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -19,10 +19,7 @@ class Scraper: self.session = requests.Session() if scraper_input.proxy: proxy_url = scraper_input.proxy - proxies = { - "http": proxy_url, - "https": proxy_url - } + proxies = {"http": proxy_url, "https": proxy_url} self.session.proxies.update(proxies) self.listing_type = scraper_input.listing_type self.site_name = scraper_input.site_name diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index f87eb43..cd79e6b 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from enum import Enum +from typing import Tuple class SiteName(Enum): @@ -56,12 +57,11 @@ class PropertyType(Enum): @dataclass class Address: - street_address: str - city: str - state: str - zip_code: str - unit: str | None = None - country: str | None = None + address_one: str | None = None + address_two: str | None = "#" + city: str | None = None + state: str | None = None + zip_code: str | None = None @dataclass @@ -73,12 
+73,7 @@ class Property: property_type: PropertyType | None = None # house for sale - price: int | None = None tax_assessed_value: int | None = None - currency: str | None = None - square_feet: int | None = None - beds: int | None = None - baths: float | None = None lot_area_value: float | None = None lot_area_unit: str | None = None stories: int | None = None @@ -90,23 +85,25 @@ class Property: img_src: str | None = None description: str | None = None status_text: str | None = None - latitude: float | None = None - longitude: float | None = None posted_time: str | None = None # building for sale bldg_name: str | None = None - bldg_unit_count: int | None = None - bldg_min_beds: int | None = None - bldg_min_baths: float | None = None - bldg_min_area: int | None = None + area_min: int | None = None - # apt - apt_min_beds: int | None = None - apt_max_beds: int | None = None - apt_min_baths: float | None = None - apt_max_baths: float | None = None - apt_min_price: int | None = None - apt_max_price: int | None = None - apt_min_sqft: int | None = None - apt_max_sqft: int | None = None + beds_min: int | None = None + beds_max: int | None = None + + baths_min: float | None = None + baths_max: float | None = None + + sqft_min: int | None = None + sqft_max: int | None = None + + price_min: int | None = None + price_max: int | None = None + + unit_count: int | None = None + + latitude: float | None = None + longitude: float | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index ccddf71..78ecc84 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -1,16 +1,23 @@ -import json +""" +homeharvest.realtor.__init__ +~~~~~~~~~~~~ + +This module implements the scraper for realtor.com +""" from ..models import Property, Address from .. import Scraper -from typing import Any, Generator from ....exceptions import NoResultsFound -from ....utils import parse_address_two, parse_unit +from ....utils import parse_address_one, parse_address_two from concurrent.futures import ThreadPoolExecutor, as_completed class RealtorScraper(Scraper): def __init__(self, scraper_input): + self.counter = 1 super().__init__(scraper_input) - self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" + self.search_url = ( + "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" + ) def handle_location(self): headers = { @@ -50,6 +57,9 @@ class RealtorScraper(Scraper): return result[0] def handle_address(self, property_id: str) -> list[Property]: + """ + Handles a specific address & returns one property + """ query = """query Property($property_id: ID!)
{ property(id: $property_id) { property_id @@ -108,43 +118,45 @@ class RealtorScraper(Scraper): response_json = response.json() property_info = response_json["data"]["property"] - street_address, unit = parse_address_two(property_info["address"]["line"]) + address_one, address_two = parse_address_one(property_info["address"]["line"]) return [ Property( site_name=self.site_name, address=Address( - street_address=street_address, + address_one=address_one, + address_two=address_two, city=property_info["address"]["city"], state=property_info["address"]["state_code"], zip_code=property_info["address"]["postal_code"], - unit=unit, - country="USA", ), property_url="https://www.realtor.com/realestateandhomes-detail/" + property_info["details"]["permalink"], - beds=property_info["basic"]["beds"], - baths=property_info["basic"]["baths"], stories=property_info["details"]["stories"], year_built=property_info["details"]["year_built"], - square_feet=property_info["basic"]["sqft"], - price_per_sqft=property_info["basic"]["price"] - // property_info["basic"]["sqft"] - if property_info["basic"]["sqft"] is not None - and property_info["basic"]["price"] is not None + price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"] + if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None else None, - price=property_info["basic"]["price"], mls_id=property_id, listing_type=self.listing_type, lot_area_value=property_info["public_record"]["lot_size"] if property_info["public_record"] is not None else None, + beds_min=property_info["basic"]["beds"], + beds_max=property_info["basic"]["beds"], + baths_min=property_info["basic"]["baths"], + baths_max=property_info["basic"]["baths"], + sqft_min=property_info["basic"]["sqft"], + sqft_max=property_info["basic"]["sqft"], + price_min=property_info["basic"]["price"], + price_max=property_info["basic"]["price"], ) ] - def handle_area( - self, variables: dict, return_total: bool = False - ) -> list[Property] | int: + def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: + """ + Handles a location area & returns a list of properties + """ query = ( """query Home_search( $city: String, @@ -237,17 +249,15 @@ class RealtorScraper(Scraper): return [] for result in response_json["data"]["home_search"]["results"]: - street_address, unit = parse_address_two( - result["location"]["address"]["line"] - ) + self.counter += 1 + address_one, _ = parse_address_one(result["location"]["address"]["line"]) realty_property = Property( address=Address( - street_address=street_address, + address_one=address_one, city=result["location"]["address"]["city"], state=result["location"]["address"]["state_code"], zip_code=result["location"]["address"]["postal_code"], - unit=parse_unit(result["location"]["address"]["unit"]), - country="USA", + address_two=parse_address_two(result["location"]["address"]["unit"]), ), latitude=result["location"]["address"]["coordinate"]["lat"] if result @@ -264,20 +274,22 @@ class RealtorScraper(Scraper): and "lon" in result["location"]["address"]["coordinate"] else None, site_name=self.site_name, - property_url="https://www.realtor.com/realestateandhomes-detail/" - + result["property_id"], - beds=result["description"]["beds"], - baths=result["description"]["baths"], + property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"], stories=result["description"]["stories"], year_built=result["description"]["year_built"], - 
square_feet=result["description"]["sqft"], price_per_sqft=result["price_per_sqft"], - price=result["list_price"], mls_id=result["property_id"], listing_type=self.listing_type, lot_area_value=result["description"]["lot_sqft"], + beds_min=result["description"]["beds"], + beds_max=result["description"]["beds"], + baths_min=result["description"]["baths"], + baths_max=result["description"]["baths"], + sqft_min=result["description"]["sqft"], + sqft_max=result["description"]["sqft"], + price_min=result["list_price"], + price_max=result["list_price"], ) - properties.append(realty_property) return properties diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index c4e8756..3582cd0 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -1,7 +1,13 @@ +""" +homeharvest.redfin.__init__ +~~~~~~~~~~~~ + +This module implements the scraper for redfin.com +""" import json from typing import Any from .. import Scraper -from ....utils import parse_address_two, parse_unit +from ....utils import parse_address_two, parse_address_one from ..models import Property, Address, PropertyType, ListingType, SiteName from ....exceptions import NoResultsFound @@ -12,9 +18,7 @@ class RedfinScraper(Scraper): self.listing_type = scraper_input.listing_type def _handle_location(self): - url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format( - self.location - ) + url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(self.location) response = self.session.get(url) response_json = json.loads(response.text.replace("{}&&", "")) @@ -28,9 +32,7 @@ class RedfinScraper(Scraper): return "address" #: address, needs to be handled differently if "exactMatch" not in response_json["payload"]: - raise NoResultsFound( - "No results found for location: {}".format(self.location) - ) + raise NoResultsFound("No results found for location: {}".format(self.location)) if response_json["payload"]["exactMatch"] is not None: target = response_json["payload"]["exactMatch"] @@ -45,39 +47,30 @@ class RedfinScraper(Scraper): return home[key]["value"] if not single_search: - street_address, unit = parse_address_two(get_value("streetLine")) - unit = parse_unit(get_value("streetLine")) address = Address( - street_address=street_address, + address_one=parse_address_one(get_value("streetLine"))[0], + address_two=parse_address_one(get_value("streetLine"))[1], city=home.get("city"), state=home.get("state"), zip_code=home.get("zip"), - unit=unit, - country="USA", ) else: address_info = home.get("streetAddress") - street_address, unit = parse_address_two(address_info.get("assembledAddress")) + address_one, address_two = parse_address_one(address_info.get("assembledAddress")) address = Address( - street_address=street_address, + address_one=address_one, + address_two=address_two, city=home.get("city"), state=home.get("state"), zip_code=home.get("zip"), - unit=unit, - country="USA", ) url = "https://www.redfin.com{}".format(home["url"]) - #: property_type = home["propertyType"] if "propertyType" in home else None lot_size_data = home.get("lotSize") if not isinstance(lot_size_data, int): - lot_size = ( - lot_size_data.get("value", None) - if isinstance(lot_size_data, dict) - else None - ) + lot_size = lot_size_data.get("value", None) if isinstance(lot_size_data, dict) else None else: lot_size = lot_size_data @@ -86,26 +79,24 @@ class RedfinScraper(Scraper): 
listing_type=self.listing_type, address=address, property_url=url, - beds=home["beds"] if "beds" in home else None, - baths=home["baths"] if "baths" in home else None, + beds_min=home["beds"] if "beds" in home else None, + beds_max=home["beds"] if "beds" in home else None, + baths_min=home["baths"] if "baths" in home else None, + baths_max=home["baths"] if "baths" in home else None, + price_min=get_value("price"), + price_max=get_value("price"), + sqft_min=get_value("sqFt"), + sqft_max=get_value("sqFt"), stories=home["stories"] if "stories" in home else None, agent_name=get_value("listingAgent"), description=home["listingRemarks"] if "listingRemarks" in home else None, - year_built=get_value("yearBuilt") - if not single_search - else home["yearBuilt"], - square_feet=get_value("sqFt"), + year_built=get_value("yearBuilt") if not single_search else home["yearBuilt"], lot_area_value=lot_size, property_type=PropertyType.from_int_code(home.get("propertyType")), price_per_sqft=get_value("pricePerSqFt"), - price=get_value("price"), mls_id=get_value("mlsId"), - latitude=home["latLong"]["latitude"] - if "latLong" in home and "latitude" in home["latLong"] - else None, - longitude=home["latLong"]["longitude"] - if "latLong" in home and "longitude" in home["latLong"] - else None, + latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None, + longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None, ) def _handle_rentals(self, region_id, region_type): @@ -125,12 +116,10 @@ class RedfinScraper(Scraper): address_info = home_data.get("addressInfo", {}) centroid = address_info.get("centroid", {}).get("centroid", {}) address = Address( - street_address=address_info.get("formattedStreetLine", None), - city=address_info.get("city", None), - state=address_info.get("state", None), - zip_code=address_info.get("zip", None), - unit=None, - country="US" if address_info.get("countryCode", None) == 1 else None, + address_one=parse_address_one(address_info.get("formattedStreetLine"))[0], + city=address_info.get("city"), + state=address_info.get("state"), + zip_code=address_info.get("zip"), ) price_range = rental_data.get("rentPriceRange", {"min": None, "max": None}) @@ -143,20 +132,20 @@ class RedfinScraper(Scraper): site_name=SiteName.REDFIN, listing_type=ListingType.FOR_RENT, address=address, - apt_min_beds=bed_range.get("min", None), - apt_min_baths=bath_range.get("min", None), - apt_max_beds=bed_range.get("max", None), - apt_max_baths=bath_range.get("max", None), - description=rental_data.get("description", None), - latitude=centroid.get("latitude", None), - longitude=centroid.get("longitude", None), - apt_min_price=price_range.get("min", None), - apt_max_price=price_range.get("max", None), - apt_min_sqft=sqft_range.get("min", None), - apt_max_sqft=sqft_range.get("max", None), - img_src=home_data.get("staticMapUrl", None), - posted_time=rental_data.get("lastUpdated", None), - bldg_name=rental_data.get("propertyName", None), + description=rental_data.get("description"), + latitude=centroid.get("latitude"), + longitude=centroid.get("longitude"), + baths_min=bath_range.get("min"), + baths_max=bath_range.get("max"), + beds_min=bed_range.get("min"), + beds_max=bed_range.get("max"), + price_min=price_range.get("min"), + price_max=price_range.get("max"), + sqft_min=sqft_range.get("min"), + sqft_max=sqft_range.get("max"), + img_src=home_data.get("staticMapUrl"), + posted_time=rental_data.get("lastUpdated"), + 
bldg_name=rental_data.get("propertyName"), ) properties_list.append(property_) @@ -175,16 +164,15 @@ class RedfinScraper(Scraper): building["address"]["streetType"], ] ) - street_address, unit = parse_address_two(street_address) return Property( site_name=self.site_name, property_type=PropertyType("BUILDING"), address=Address( - street_address=street_address, + address_one=parse_address_one(street_address)[0], city=building["address"]["city"], state=building["address"]["stateOrProvinceCode"], zip_code=building["address"]["postalCode"], - unit=parse_unit( + address_two=parse_address_two( " ".join( [ building["address"]["unitType"], @@ -195,7 +183,7 @@ class RedfinScraper(Scraper): ), property_url="https://www.redfin.com{}".format(building["url"]), listing_type=self.listing_type, - bldg_unit_count=building["numUnitsForSale"], + unit_count=building["numUnitsForSale"], ) def handle_address(self, home_id: str): @@ -206,7 +194,6 @@ class RedfinScraper(Scraper): https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3 https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3 """ - url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format( home_id ) @@ -214,9 +201,7 @@ class RedfinScraper(Scraper): response = self.session.get(url) response_json = json.loads(response.text.replace("{}&&", "")) - parsed_home = self._parse_home( - response_json["payload"]["addressSectionInfo"], single_search=True - ) + parsed_home = self._parse_home(response_json["payload"]["addressSectionInfo"], single_search=True) return [parsed_home] def search(self): @@ -235,10 +220,7 @@ class RedfinScraper(Scraper): url = f"https://www.redfin.com/stingray/api/gis?al=1®ion_id={region_id}®ion_type={region_type}&sold_within_days=30&num_homes=100000" response = self.session.get(url) response_json = json.loads(response.text.replace("{}&&", "")) - homes = [ - self._parse_home(home) for home in response_json["payload"]["homes"] - ] + [ - self._parse_building(building) - for building in response_json["payload"]["buildings"].values() + homes = [self._parse_home(home) for home in response_json["payload"]["homes"]] + [ + self._parse_building(building) for building in response_json["payload"]["buildings"].values() ] return homes diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 0b7b70d..217e824 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -1,7 +1,13 @@ +""" +homeharvest.zillow.__init__ +~~~~~~~~~~~~ + +This module implements the scraper for zillow.com +""" import re import json from .. 
import Scraper -from ....utils import parse_address_two, parse_unit +from ....utils import parse_address_one, parse_address_two from ....exceptions import GeoCoordsNotFound, NoResultsFound from ..models import Property, Address, ListingType, PropertyType @@ -13,12 +19,13 @@ class ZillowScraper(Scraper): if not self.is_plausible_location(self.location): raise NoResultsFound("Invalid location input: {}".format(self.location)) - if self.listing_type == ListingType.FOR_SALE: - self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" - elif self.listing_type == ListingType.FOR_RENT: - self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" - else: - self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/" + listing_type_to_url_path = { + ListingType.FOR_SALE: "for_sale", + ListingType.FOR_RENT: "for_rent", + ListingType.SOLD: "recently_sold", + } + + self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/" def is_plausible_location(self, location: str) -> bool: url = ( @@ -31,9 +38,7 @@ class ZillowScraper(Scraper): return response.json()["results"] != [] def search(self): - resp = self.session.get( - self.url, headers=self._get_headers() - ) + resp = self.session.get(self.url, headers=self._get_headers()) resp.raise_for_status() content = resp.text @@ -43,9 +48,7 @@ class ZillowScraper(Scraper): re.DOTALL, ) if not match: - raise NoResultsFound( - "No results were found for Zillow with the given Location." - ) + raise NoResultsFound("No results were found for Zillow with the given Location.") json_str = match.group(1) data = json.loads(json_str) @@ -130,9 +133,7 @@ class ZillowScraper(Scraper): "wants": {"cat1": ["mapResults"]}, "isDebugRequest": False, } - resp = self.session.put( - url, headers=self._get_headers(), json=payload - ) + resp = self.session.put(url, headers=self._get_headers(), json=payload) resp.raise_for_status() a = resp.json() return self._parse_properties(resp.json()) @@ -146,87 +147,71 @@ class ZillowScraper(Scraper): if "hdpData" in result: home_info = result["hdpData"]["homeInfo"] address_data = { - "street_address": parse_address_two(home_info["streetAddress"])[0], - "unit": parse_unit(home_info["unit"]) - if "unit" in home_info - else None, + "address_one": parse_address_one(home_info["streetAddress"])[0], + "address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#", "city": home_info["city"], "state": home_info["state"], "zip_code": home_info["zipcode"], - "country": home_info["country"], } - property_data = { - "site_name": self.site_name, - "address": Address(**address_data), - "property_url": f"https://www.zillow.com{result['detailUrl']}", - "beds": int(home_info["bedrooms"]) - if "bedrooms" in home_info - else None, - "baths": home_info.get("bathrooms"), - "square_feet": int(home_info["livingArea"]) - if "livingArea" in home_info - else None, - "currency": home_info["currency"], - "price": home_info.get("price"), - "tax_assessed_value": int(home_info["taxAssessedValue"]) - if "taxAssessedValue" in home_info - else None, - "property_type": PropertyType(home_info["homeType"]), - "listing_type": ListingType( - home_info["statusType"] - if "statusType" in home_info - else self.listing_type + property_obj = Property( + site_name=self.site_name, + address=Address(**address_data), + property_url=f"https://www.zillow.com{result['detailUrl']}", + tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None, + 
property_type=PropertyType(home_info["homeType"]), + listing_type=ListingType( + home_info["statusType"] if "statusType" in home_info else self.listing_type ), - "lot_area_value": round(home_info["lotAreaValue"], 2) - if "lotAreaValue" in home_info - else None, - "lot_area_unit": home_info.get("lotAreaUnit"), - "latitude": result["latLong"]["latitude"], - "longitude": result["latLong"]["longitude"], - "status_text": result.get("statusText"), - "posted_time": result["variableData"]["text"] + status_text=result.get("statusText"), + posted_time=result["variableData"]["text"] if "variableData" in result and "text" in result["variableData"] and result["variableData"]["type"] == "TIME_ON_INFO" else None, - "img_src": result.get("imgSrc"), - "price_per_sqft": int(home_info["price"] // home_info["livingArea"]) - if "livingArea" in home_info - and home_info["livingArea"] != 0 - and "price" in home_info + price_min=home_info.get("price"), + price_max=home_info.get("price"), + beds_min=int(home_info["bedrooms"]) if "bedrooms" in home_info else None, + beds_max=int(home_info["bedrooms"]) if "bedrooms" in home_info else None, + baths_min=home_info.get("bathrooms"), + baths_max=home_info.get("bathrooms"), + sqft_min=int(home_info["livingArea"]) if "livingArea" in home_info else None, + sqft_max=int(home_info["livingArea"]) if "livingArea" in home_info else None, + price_per_sqft=int(home_info["price"] // home_info["livingArea"]) + if "livingArea" in home_info and home_info["livingArea"] != 0 and "price" in home_info else None, - } - property_obj = Property(**property_data) + latitude=result["latLong"]["latitude"], + longitude=result["latLong"]["longitude"], + lot_area_value=round(home_info["lotAreaValue"], 2) if "lotAreaValue" in home_info else None, + lot_area_unit=home_info.get("lotAreaUnit"), + img_src=result.get("imgSrc"), + ) + properties_list.append(property_obj) elif "isBuilding" in result: - price = result["price"] - building_data = { - "property_url": f"https://www.zillow.com{result['detailUrl']}", - "site_name": self.site_name, - "property_type": PropertyType("BUILDING"), - "listing_type": ListingType(result["statusType"]), - "img_src": result["imgSrc"], - "price": int(price.replace("From $", "").replace(",", "")) - if "From $" in price - else None, - "apt_min_price": int( - price.replace("$", "").replace(",", "").replace("+/mo", "") - ) - if "+/mo" in price - else None, - "address": self._extract_address(result["address"]), - "bldg_min_beds": result["minBeds"], - "currency": "USD", - "bldg_min_baths": result["minBaths"], - "bldg_min_area": result.get("minArea"), - "bldg_unit_count": result["unitCount"], - "bldg_name": result.get("communityName"), - "status_text": result["statusText"], - "latitude": result["latLong"]["latitude"], - "longitude": result["latLong"]["longitude"], - } - building_obj = Property(**building_data) + price_string = result["price"].replace("$", "").replace(",", "").replace("+/mo", "") + + match = re.search(r"(\d+)", price_string) + price_value = int(match.group(1)) if match else None + building_obj = Property( + property_url=f"https://www.zillow.com{result['detailUrl']}", + site_name=self.site_name, + property_type=PropertyType("BUILDING"), + listing_type=ListingType(result["statusType"]), + img_src=result["imgSrc"], + address=self._extract_address(result["address"]), + baths_min=result["minBaths"], + area_min=result.get("minArea"), + bldg_name=result.get("communityName"), + status_text=result["statusText"], + beds_min=result["minBeds"], + price_min=price_value if "+/mo" 
in result["price"] else None, + price_max=price_value if "+/mo" in result["price"] else None, + latitude=result["latLong"]["latitude"], + longitude=result["latLong"]["longitude"], + unit_count=result["unitCount"], + ) + properties_list.append(building_obj) return properties_list @@ -241,43 +226,41 @@ class ZillowScraper(Scraper): else property_data["hdpUrl"] ) address_data = property_data["address"] - street_address, unit = parse_address_two(address_data["streetAddress"]) + address_one, address_two = parse_address_one(address_data["streetAddress"]) address = Address( - street_address=street_address, - unit=unit, + address_one=address_one, + address_two=address_two if address_two else "#", city=address_data["city"], state=address_data["state"], zip_code=address_data["zipcode"], - country=property_data.get("country"), ) property_type = property_data.get("homeType", None) return Property( site_name=self.site_name, - address=address, property_url=url, - beds=property_data.get("bedrooms", None), - baths=property_data.get("bathrooms", None), - year_built=property_data.get("yearBuilt", None), - price=property_data.get("price", None), - tax_assessed_value=property_data.get("taxAssessedValue", None), + property_type=PropertyType(property_type), + listing_type=self.listing_type, + address=address, + year_built=property_data.get("yearBuilt"), + tax_assessed_value=property_data.get("taxAssessedValue"), + lot_area_value=property_data.get("lotAreaValue"), + lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None, + agent_name=property_data.get("attributionInfo", {}).get("agentName"), + stories=property_data.get("resoFacts", {}).get("stories"), + mls_id=property_data.get("attributionInfo", {}).get("mlsId"), + beds_min=property_data.get("bedrooms"), + beds_max=property_data.get("bedrooms"), + baths_min=property_data.get("bathrooms"), + baths_max=property_data.get("bathrooms"), + price_min=property_data.get("price"), + price_max=property_data.get("price"), + sqft_min=property_data.get("livingArea"), + sqft_max=property_data.get("livingArea"), + price_per_sqft=property_data.get("resoFacts", {}).get("pricePerSquareFoot"), latitude=property_data.get("latitude"), longitude=property_data.get("longitude"), img_src=property_data.get("streetViewTileImageUrlMediumAddress"), - currency=property_data.get("currency", None), - lot_area_value=property_data.get("lotAreaValue"), - lot_area_unit=property_data["lotAreaUnits"].lower() - if "lotAreaUnits" in property_data - else None, - agent_name=property_data.get("attributionInfo", {}).get("agentName", None), - stories=property_data.get("resoFacts", {}).get("stories", None), - description=property_data.get("description", None), - mls_id=property_data.get("attributionInfo", {}).get("mlsId", None), - price_per_sqft=property_data.get("resoFacts", {}).get( - "pricePerSquareFoot", None - ), - square_feet=property_data.get("livingArea", None), - property_type=PropertyType(property_type), - listing_type=self.listing_type, + description=property_data.get("description"), ) def _extract_address(self, address_str): @@ -290,7 +273,7 @@ class ZillowScraper(Scraper): if len(parts) != 3: raise ValueError(f"Unexpected address format: {address_str}") - street_address = parts[0].strip() + address_one = parts[0].strip() city = parts[1].strip() state_zip = parts[2].split(" ") @@ -303,14 +286,13 @@ class ZillowScraper(Scraper): else: raise ValueError(f"Unexpected state/zip format in address: {address_str}") - street_address, unit = 
parse_address_two(street_address) + address_one, address_two = parse_address_one(address_one) return Address( - street_address=street_address, + address_one=address_one, + address_two=address_two if address_two else "#", city=city, - unit=unit, state=state, zip_code=zip_code, - country="USA", ) @staticmethod diff --git a/homeharvest/utils.py b/homeharvest/utils.py index 4b68913..2aeedee 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,9 +1,9 @@ import re -def parse_address_two(street_address: str) -> tuple: +def parse_address_one(street_address: str) -> tuple: if not street_address: - return street_address, None + return street_address, "#" apt_match = re.search( r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$", @@ -13,36 +13,26 @@ def parse_address_two(street_address: str) -> tuple: if apt_match: apt_str = apt_match.group().strip() - cleaned_apt_str = re.sub( - r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I - ) + cleaned_apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I) main_address = street_address.replace(apt_str, "").strip() return main_address, cleaned_apt_str else: - return street_address, None + return street_address, "#" -def parse_unit(street_address: str): +def parse_address_two(street_address: str): if not street_address: - return None + return "#" apt_match = re.search( - r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$", + r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$", street_address, re.I, ) if apt_match: apt_str = apt_match.group().strip() - apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I) + apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I) return apt_str else: - return None - - -if __name__ == "__main__": - print(parse_address_two("4303 E Cactus Rd Apt 126")) - print(parse_address_two("1234 Elm Street apt 2B")) - print(parse_address_two("1234 Elm Street UNIT 3A")) - print(parse_address_two("1234 Elm Street unit 3A")) - print(parse_address_two("1234 Elm Street SuIte 3A")) + return "#" diff --git a/tests/test_redfin.py b/tests/test_redfin.py index 8d4c194..b55b442 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -9,15 +9,9 @@ from homeharvest.exceptions import ( def test_redfin(): results = [ - scrape_property( - location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale" - ), - scrape_property( - location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent" - ), - scrape_property( - location="Dallas, TX, USA", site_name="redfin", listing_type="sold" - ), + scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"), + scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"), + scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"), scrape_property(location="85281", site_name="redfin"), ] diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..d21ee77 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,24 @@ +from homeharvest.utils import parse_address_one, parse_address_two + + +def test_parse_address_one(): + test_data = [ + ("4303 E Cactus Rd Apt 126", ("4303 E Cactus Rd", "#126")), + ("1234 Elm Street apt 2B", ("1234 Elm Street", "#2B")), + ("1234 Elm Street UNIT 3A", ("1234 Elm Street", "#3A")), + ("1234 Elm Street unit 3A", ("1234 Elm Street", "#3A")), + ("1234 Elm Street SuIte 3A", ("1234 Elm Street", 
"#3A")), + ] + + for input_data, (exp_addr_one, exp_addr_two) in test_data: + address_one, address_two = parse_address_one(input_data) + assert address_one == exp_addr_one + assert address_two == exp_addr_two + + +def test_parse_address_two(): + test_data = [("Apt 126", "#126"), ("apt 2B", "#2B"), ("UNIT 3A", "#3A"), ("unit 3A", "#3A"), ("SuIte 3A", "#3A")] + + for input_data, expected in test_data: + output = parse_address_two(input_data) + assert output == expected diff --git a/tests/test_zillow.py b/tests/test_zillow.py index 6a70eae..49c48ad 100644 --- a/tests/test_zillow.py +++ b/tests/test_zillow.py @@ -9,15 +9,9 @@ from homeharvest.exceptions import ( def test_zillow(): results = [ - scrape_property( - location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale" - ), - scrape_property( - location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent" - ), - scrape_property( - location="Dallas, TX, USA", site_name="zillow", listing_type="sold" - ), + scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"), + scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"), + scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"), scrape_property(location="85281", site_name="zillow"), ]