From 8e04f6b1173882fcf75525443d59f5e918a8bc47 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 3 Nov 2024 17:23:07 -0600 Subject: [PATCH] enh: property type (#102) --- README.md | 17 ++- examples/HomeHarvest_Demo.ipynb | 141 ------------------ examples/HomeHarvest_Demo.py | 20 --- examples/price_of_land.py | 104 +++++++++++++ homeharvest/__init__.py | 9 +- homeharvest/core/scrapers/__init__.py | 68 +++++---- homeharvest/core/scrapers/models.py | 13 ++ homeharvest/core/scrapers/realtor/__init__.py | 94 ++++++++---- poetry.lock | 19 ++- pyproject.toml | 3 +- tests/test_realtor.py | 27 ++-- 11 files changed, 274 insertions(+), 241 deletions(-) delete mode 100644 examples/HomeHarvest_Demo.ipynb delete mode 100644 examples/HomeHarvest_Demo.py create mode 100644 examples/price_of_land.py diff --git a/README.md b/README.md index ad72eed..f522d1d 100644 --- a/README.md +++ b/README.md @@ -68,13 +68,24 @@ print(properties.head()) ``` Required ├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc. -└── listing_type (option): Choose the type of listing. +├── listing_type (option): Choose the type of listing. - 'for_rent' - 'for_sale' - 'sold' - - 'pending' + - 'pending' (for pending/contingent sales) Optional +├── property_type (list): Choose the type of properties. + - 'single_family' + - 'multi_family' + - 'condos' + - 'condo_townhome_rowhome_coop' + - 'condo_townhome' + - 'townhomes' + - 'duplex_triplex' + - 'farm' + - 'land' + - 'mobile' ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses. │ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored) │ @@ -94,7 +105,7 @@ Optional │ ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data for general searches (e.g. schools, tax appraisals etc.) │ -├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' +├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending' │ └── limit (integer): Limit the number of properties to fetch. Max & default is 10000. ``` diff --git a/examples/HomeHarvest_Demo.ipynb b/examples/HomeHarvest_Demo.ipynb deleted file mode 100644 index 9e6aa49..0000000 --- a/examples/HomeHarvest_Demo.ipynb +++ /dev/null @@ -1,141 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "cb48903e-5021-49fe-9688-45cd0bc05d0f", - "metadata": { - "is_executing": true - }, - "outputs": [], - "source": [ - "from homeharvest import scrape_property\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "156488ce-0d5f-43c5-87f4-c33e9c427860", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None) # Show all columns\n", - "pd.set_option('display.max_rows', None) # Show all rows\n", - "pd.set_option('display.width', None) # Auto-adjust display width to fit console\n", - "pd.set_option('display.max_colwidth', 50) # Limit max column width to 50 characters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c8b9744-8606-4e9b-8add-b90371a249a7", - "metadata": {}, - "outputs": [], - "source": [ - "# check for sale properties\n", - "scrape_property(\n", - " location=\"dallas\",\n", - " listing_type=\"for_sale\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aaf86093", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# search a specific address\n", - "scrape_property(\n", - " location=\"2530 Al Lipscomb Way\",\n", - " listing_type=\"for_sale\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e", - "metadata": {}, - "outputs": [], - "source": [ - "# check rentals\n", - "scrape_property(\n", - " location=\"chicago, illinois\",\n", - " listing_type=\"for_rent\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af280cd3", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# check sold properties\n", - "properties = scrape_property(\n", - " location=\"90210\",\n", - " listing_type=\"sold\",\n", - " past_days=10\n", - ")\n", - "display(properties)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "628c1ce2", - "metadata": { - "collapsed": false, - "is_executing": true, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# display clickable URLs\n", - "from IPython.display import display, HTML\n", - "properties['property_url'] = '' + properties['property_url'] + ''\n", - "\n", - "html = properties.to_html(escape=False)\n", - "truncate_width = f'{html}'\n", - "display(HTML(truncate_width))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/HomeHarvest_Demo.py b/examples/HomeHarvest_Demo.py deleted file mode 100644 index 46d31b1..0000000 --- a/examples/HomeHarvest_Demo.py +++ /dev/null @@ -1,20 +0,0 @@ -from homeharvest import scrape_property -from datetime import datetime - -# Generate filename based on current timestamp -current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -filename = f"HomeHarvest_{current_timestamp}.csv" - -properties = scrape_property( - location="San Diego, CA", - listing_type="sold", # or (for_sale, for_rent) - past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent) - # pending_or_contingent=True # use on for_sale listings to find pending / contingent listings - # mls_only=True, # only fetch MLS listings - # proxy="http://user:pass@host:port" # use a proxy to change your IP address -) -print(f"Number of properties: {len(properties)}") - -# Export to csv -properties.to_csv(filename, index=False) -print(properties.head()) diff --git a/examples/price_of_land.py b/examples/price_of_land.py new file mode 100644 index 0000000..4b35e21 --- /dev/null +++ b/examples/price_of_land.py @@ -0,0 +1,104 @@ +""" +This script scrapes sold and pending sold land listings in past year for a list of zip codes and saves the data to individual Excel files. +It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre) for user to analyze average price of land in a zip code. +""" + +import os +import pandas as pd +from homeharvest import scrape_property + + +def get_property_details(zip: str, listing_type): + properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365) + if not properties.empty: + properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None) + + properties = properties[properties["sqft"].isnull()] + properties["ppa"] = properties.apply( + lambda row: ( + int( + ( + row["sold_price"] + if (pd.notnull(row["sold_price"]) and row["status"] == "SOLD") + else row["list_price"] + ) + / row["lot_acres"] + ) + if pd.notnull(row["lot_acres"]) + and row["lot_acres"] > 0 + and (pd.notnull(row["sold_price"]) or pd.notnull(row["list_price"])) + else None + ), + axis=1, + ) + properties["ppa"] = properties["ppa"].astype("Int64") + selected_columns = [ + "property_url", + "property_id", + "style", + "status", + "street", + "city", + "state", + "zip_code", + "county", + "list_date", + "last_sold_date", + "list_price", + "sold_price", + "lot_sqft", + "lot_acres", + "ppa", + ] + properties = properties[selected_columns] + return properties + + +def output_to_excel(zip_code, sold_df, pending_df): + root_folder = os.getcwd() + zip_folder = os.path.join(root_folder, "zips", zip_code) + + # Create zip code folder if it doesn't exist + os.makedirs(zip_folder, exist_ok=True) + + # Define file paths + sold_file = os.path.join(zip_folder, f"{zip_code}_sold.xlsx") + pending_file = os.path.join(zip_folder, f"{zip_code}_pending.xlsx") + + # Save individual sold and pending files + sold_df.to_excel(sold_file, index=False) + pending_df.to_excel(pending_file, index=False) + + +zip_codes = map( + str, + [ + 22920, + 77024, + 78028, + 24553, + 22967, + 22971, + 22922, + 22958, + 22969, + 22949, + 22938, + 24599, + 24562, + 22976, + 24464, + 22964, + 24581, + ], +) + +combined_df = pd.DataFrame() +for zip in zip_codes: + sold_df = get_property_details(zip, "sold") + pending_df = get_property_details(zip, "pending") + combined_df = pd.concat([combined_df, sold_df, pending_df], ignore_index=True) + output_to_excel(zip, sold_df, pending_df) + +combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx") +combined_df.to_excel(combined_file, index=False) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index b4950c2..643e954 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -3,12 +3,13 @@ import pandas as pd from .core.scrapers import ScraperInput from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit from .core.scrapers.realtor import RealtorScraper -from .core.scrapers.models import ListingType +from .core.scrapers.models import ListingType, SearchPropertyType def scrape_property( location: str, listing_type: str = "for_sale", + property_type: list[str] | None = None, radius: float = None, mls_only: bool = False, past_days: int = None, @@ -24,6 +25,7 @@ def scrape_property( Scrape properties from Realtor.com based on a given location and listing type. :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param listing_type: Listing Type (for_sale, for_rent, sold, pending) + :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile) :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param mls_only: If set, fetches only listings with MLS IDs. :param proxy: Proxy to use for scraping @@ -41,6 +43,7 @@ def scrape_property( scraper_input = ScraperInput( location=location, listing_type=ListingType[listing_type.upper()], + property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None, proxy=proxy, radius=radius, mls_only=mls_only, @@ -63,4 +66,6 @@ def scrape_property( with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) - return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA}) + return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace( + {"None": pd.NA, None: pd.NA, "": pd.NA} + ) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 745d766..1c68061 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -5,7 +5,7 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry import uuid from ...exceptions import AuthenticationError -from .models import Property, ListingType, SiteName +from .models import Property, ListingType, SiteName, SearchPropertyType import json @@ -13,6 +13,7 @@ import json class ScraperInput: location: str listing_type: ListingType + property_type: list[SearchPropertyType] | None = None radius: float | None = None mls_only: bool | None = False proxy: str | None = None @@ -34,11 +35,12 @@ class Scraper: ): self.location = scraper_input.location self.listing_type = scraper_input.listing_type + self.property_type = scraper_input.property_type if not self.session: Scraper.session = requests.Session() retries = Retry( - total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"]) + total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"]) ) adapter = HTTPAdapter(max_retries=retries) @@ -46,21 +48,21 @@ class Scraper: Scraper.session.mount("https://", adapter) Scraper.session.headers.update( { - 'accept': 'application/json, text/javascript', - 'accept-language': 'en-US,en;q=0.9', - 'cache-control': 'no-cache', - 'content-type': 'application/json', - 'origin': 'https://www.realtor.com', - 'pragma': 'no-cache', - 'priority': 'u=1, i', - 'rdc-ab-tests': 'commute_travel_time_variation:v1', - 'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-origin', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + "accept": "application/json, text/javascript", + "accept-language": "en-US,en;q=0.9", + "cache-control": "no-cache", + "content-type": "application/json", + "origin": "https://www.realtor.com", + "pragma": "no-cache", + "priority": "u=1, i", + "rdc-ab-tests": "commute_travel_time_variation:v1", + "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36", } ) @@ -94,27 +96,29 @@ class Scraper: response = requests.post( "https://graph.realtor.com/auth/token", headers={ - 'Host': 'graph.realtor.com', - 'Accept': '*/*', - 'Content-Type': 'Application/json', - 'X-Client-ID': 'rdc_mobile_native,iphone', - 'X-Visitor-ID': device_id, - 'X-Client-Version': '24.21.23.679885', - 'Accept-Language': 'en-US,en;q=0.9', - 'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0', + "Host": "graph.realtor.com", + "Accept": "*/*", + "Content-Type": "Application/json", + "X-Client-ID": "rdc_mobile_native,iphone", + "X-Visitor-ID": device_id, + "X-Client-Version": "24.21.23.679885", + "Accept-Language": "en-US,en;q=0.9", + "User-Agent": "Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0", }, - data=json.dumps({ - "grant_type": "device_mobile", - "device_id": device_id, - "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone" - })) + data=json.dumps( + { + "grant_type": "device_mobile", + "device_id": device_id, + "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone", + } + ), + ) data = response.json() if not (access_token := data.get("access_token")): raise AuthenticationError( - "Failed to get access token, use a proxy/vpn or wait a moment and try again.", - response=response + "Failed to get access token, use a proxy/vpn or wait a moment and try again.", response=response ) return access_token diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 98be66b..67c4a68 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -17,6 +17,19 @@ class SiteName(Enum): raise ValueError(f"{value} not found in {cls}") +class SearchPropertyType(Enum): + SINGLE_FAMILY = "single_family" + CONDOS = "condos" + CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop" + CONDO_TOWNHOME = "condo_townhome" + TOWNHOMES = "townhomes" + DUPLEX_TRIPLEX = "duplex_triplex" + FARM = "farm" + LAND = "land" + MULTI_FAMILY = "multi_family" + MOBILE = "mobile" + + class ListingType(Enum): FOR_SALE = "FOR_SALE" FOR_RENT = "FOR_RENT" diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 355b5b3..e45c2db 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -6,12 +6,28 @@ This module implements the scraper for realtor.com """ from __future__ import annotations + +import json from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime +from json import JSONDecodeError from typing import Dict, Union, Optional +from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt + from .. import Scraper -from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker, Builder, Advertisers, Office +from ..models import ( + Property, + Address, + ListingType, + Description, + PropertyType, + Agent, + Broker, + Builder, + Advertisers, + Office, +) from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA @@ -81,9 +97,12 @@ class RealtorScraper(Scraper): return property_info["listings"][0]["listing_id"] def handle_home(self, property_id: str) -> list[Property]: - query = """query Home($property_id: ID!) { + query = ( + """query Home($property_id: ID!) { home(property_id: $property_id) %s - }""" % HOMES_DATA + }""" + % HOMES_DATA + ) variables = {"property_id": property_id} payload = { @@ -96,9 +115,7 @@ class RealtorScraper(Scraper): property_info = response_json["data"]["home"] - return [ - self.process_property(property_info, "home") - ] + return [self.process_property(property_info, "home")] @staticmethod def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None: @@ -122,7 +139,7 @@ class RealtorScraper(Scraper): phones=advertiser.get("phones"), ) - if advertiser.get('broker') and advertiser["broker"].get('name'): #: has a broker + if advertiser.get("broker") and advertiser["broker"].get("name"): #: has a broker processed_advertisers.broker = Broker( uuid=_parse_fulfillment_id(advertiser["broker"].get("fulfillment_id")), name=advertiser["broker"].get("name"), @@ -153,15 +170,16 @@ class RealtorScraper(Scraper): return able_to_get_lat_long = ( - result - and result.get("location") - and result["location"].get("address") - and result["location"]["address"].get("coordinate") + result + and result.get("location") + and result["location"].get("address") + and result["location"]["address"].get("coordinate") ) - is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") + is_pending = result["flags"].get("is_pending") + is_contingent = result["flags"].get("is_contingent") - if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING): + if (is_pending or is_contingent) and (self.exclude_pending and self.listing_type != ListingType.PENDING): return property_id = result["property_id"] @@ -184,7 +202,7 @@ class RealtorScraper(Scraper): property_url=result["href"], property_id=property_id, listing_id=result.get("listing_id"), - status="PENDING" if is_pending else result["status"].upper(), + status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(), list_price=result["list_price"], list_price_min=result["list_price_min"], list_price_max=result["list_price_max"], @@ -225,6 +243,11 @@ class RealtorScraper(Scraper): elif self.last_x_days: date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}' + property_type_param = "" + if self.property_type: + property_types = [pt.value for pt in self.property_type] + property_type_param = f"type: {json.dumps(property_types)}" + sort_param = ( "sort: [{ field: sold_date, direction: desc }]" if self.listing_type == ListingType.SOLD @@ -259,6 +282,7 @@ class RealtorScraper(Scraper): status: %s %s %s + %s } %s limit: 200 @@ -268,6 +292,7 @@ class RealtorScraper(Scraper): is_foreclosure, listing_type.value.lower(), date_param, + property_type_param, pending_or_contingent_param, sort_param, GENERAL_RESULTS_QUERY, @@ -290,6 +315,7 @@ class RealtorScraper(Scraper): status: %s %s %s + %s } %s limit: 200 @@ -299,13 +325,14 @@ class RealtorScraper(Scraper): is_foreclosure, listing_type.value.lower(), date_param, + property_type_param, pending_or_contingent_param, sort_param, GENERAL_RESULTS_QUERY, ) else: #: general search, came from an address query = ( - """query Property_search( + """query Property_search( $property_id: [ID]! $offset: Int!, ) { @@ -315,9 +342,9 @@ class RealtorScraper(Scraper): } limit: 1 offset: $offset - ) %s + ) %s }""" - % GENERAL_RESULTS_QUERY + % GENERAL_RESULTS_QUERY ) payload = { @@ -332,12 +359,12 @@ class RealtorScraper(Scraper): properties: list[Property] = [] if ( - response_json is None - or "data" not in response_json - or response_json["data"] is None - or search_key not in response_json["data"] - or response_json["data"][search_key] is None - or "results" not in response_json["data"][search_key] + response_json is None + or "data" not in response_json + or response_json["data"] is None + or search_key not in response_json["data"] + or response_json["data"][search_key] is None + or "results" not in response_json["data"][search_key] ): return {"total": 0, "properties": []} @@ -347,12 +374,10 @@ class RealtorScraper(Scraper): #: limit the number of properties to be processed #: example, if your offset is 200, and your limit is 250, return 50 - properties_list = properties_list[:self.limit - offset] + properties_list = properties_list[: self.limit - offset] with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: - futures = [ - executor.submit(self.process_property, result, search_key) for result in properties_list - ] + futures = [executor.submit(self.process_property, result, search_key) for result in properties_list] for future in as_completed(futures): result = future.result() @@ -451,6 +476,9 @@ class RealtorScraper(Scraper): "assessed_value": assessed_value if assessed_value else None, } + @retry( + retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3) + ) def get_prop_details(self, property_id: str) -> dict: if not self.extra_property_data: return {} @@ -534,7 +562,9 @@ class RealtorScraper(Scraper): style = style.upper() primary_photo = "" - if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")): + if (primary_photo_info := result.get("primary_photo")) and ( + primary_photo_href := primary_photo_info.get("href") + ): primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") return Description( @@ -547,7 +577,7 @@ class RealtorScraper(Scraper): sqft=description_data.get("sqft"), lot_sqft=description_data.get("lot_sqft"), sold_price=( - result.get('last_sold_price') or description_data.get("sold_price") + result.get("last_sold_price") or description_data.get("sold_price") if result.get("last_sold_date") or result["list_price"] != description_data.get("sold_price") else None ), #: has a sold date or list and sold price are different @@ -581,4 +611,8 @@ class RealtorScraper(Scraper): if not photos_info: return None - return [photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") for photo_info in photos_info if photo_info.get("href")] + return [ + photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") + for photo_info in photos_info + if photo_info.get("href") + ] diff --git a/poetry.lock b/poetry.lock index b3bff2a..98dc4de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "annotated-types" @@ -667,6 +667,21 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tenacity" +version = "9.0.0" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, + {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + [[package]] name = "tomli" version = "2.0.1" @@ -740,4 +755,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "21ef9cfb35c446a375a2b74c37691d7031afb1e4f66a8b63cb7c1669470689d2" +content-hash = "cefc11b1bf5ad99d628f6d08f6f03003522cc1b6e48b519230d99d716a5c165c" diff --git a/pyproject.toml b/pyproject.toml index 2f0d8cc..56d5bca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.4.3" +version = "0.4.4" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest" @@ -14,6 +14,7 @@ python = ">=3.9,<3.13" requests = "^2.31.0" pandas = "^2.1.1" pydantic = "^2.7.4" +tenacity = "^9.0.0" [tool.poetry.group.dev.dependencies] diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 284875a..df2249e 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -105,8 +105,12 @@ def test_realtor(): location="2530 Al Lipscomb Way", listing_type="for_sale", ), - scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format - scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format + scrape_property( + location="Phoenix, AZ", listing_type="for_rent", limit=1000 + ), #: does not support "city, state, USA" format + scrape_property( + location="Dallas, TX", listing_type="sold", limit=1000 + ), #: does not support "city, state, USA" format scrape_property(location="85281"), ] @@ -114,11 +118,13 @@ def test_realtor(): def test_realtor_city(): - results = scrape_property( - location="Atlanta, GA", - listing_type="for_sale", - limit=1000 - ) + results = scrape_property(location="Atlanta, GA", listing_type="for_sale", limit=1000) + + assert results is not None and len(results) > 0 + + +def test_realtor_land(): + results = scrape_property(location="Atlanta, GA", listing_type="for_sale", property_type=["land"], limit=1000) assert results is not None and len(results) > 0 @@ -241,9 +247,10 @@ def test_apartment_list_price(): results = results[results["style"] == "APARTMENT"] #: get percentage of results with atleast 1 of any column not none, list_price, list_price_min, list_price_max - assert len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len( - results - ) > 0.5 + assert ( + len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(results) + > 0.5 + ) def test_builder_exists():