enh: property type (#102)

2026-03-04 11:34:32 -08:00 · 2024-11-03 17:23:07 -06:00
parent 1f717bd9e3
commit 8e04f6b117
11 changed files with 274 additions and 241 deletions
--- a/examples/HomeHarvest_Demo.ipynb
+++ b/examples/HomeHarvest_Demo.ipynb
@@ -1,141 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
-   "metadata": {
-    "is_executing": true
-   },
-   "outputs": [],
-   "source": [
-    "from homeharvest import scrape_property\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.set_option('display.max_columns', None)  # Show all columns\n",
-    "pd.set_option('display.max_rows', None)     # Show all rows\n",
-    "pd.set_option('display.width', None)        # Auto-adjust display width to fit console\n",
-    "pd.set_option('display.max_colwidth', 50)   # Limit max column width to 50 characters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check for sale properties\n",
-    "scrape_property(\n",
-    "    location=\"dallas\",\n",
-    "    listing_type=\"for_sale\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aaf86093",
-   "metadata": {
-    "collapsed": false,
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# search a specific address\n",
-    "scrape_property(\n",
-    "    location=\"2530 Al Lipscomb Way\",\n",
-    "    listing_type=\"for_sale\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check rentals\n",
-    "scrape_property(\n",
-    "    location=\"chicago, illinois\",\n",
-    "    listing_type=\"for_rent\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "af280cd3",
-   "metadata": {
-    "collapsed": false,
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# check sold properties\n",
-    "properties = scrape_property(\n",
-    "    location=\"90210\",\n",
-    "    listing_type=\"sold\",\n",
-    "    past_days=10\n",
-    ")\n",
-    "display(properties)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "628c1ce2",
-   "metadata": {
-    "collapsed": false,
-    "is_executing": true,
-    "jupyter": {
-     "outputs_hidden": false
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# display clickable URLs\n",
-    "from IPython.display import display, HTML\n",
-    "properties['property_url'] = '<a href=\"' + properties['property_url'] + '\" target=\"_blank\">' + properties['property_url'] + '</a>'\n",
-    "\n",
-    "html = properties.to_html(escape=False)\n",
-    "truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
-    "display(HTML(truncate_width))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/examples/HomeHarvest_Demo.py
+++ b/examples/HomeHarvest_Demo.py
@@ -1,20 +0,0 @@
-from homeharvest import scrape_property
-from datetime import datetime
-
-# Generate filename based on current timestamp
-current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-filename = f"HomeHarvest_{current_timestamp}.csv"
-
-properties = scrape_property(
-    location="San Diego, CA",
-    listing_type="sold",  # or (for_sale, for_rent)
-    past_days=30,  # sold in last 30 days - listed in last x days if (for_sale, for_rent)
-    # pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
-    # mls_only=True,  # only fetch MLS listings
-    # proxy="http://user:pass@host:port"  # use a proxy to change your IP address
-)
-print(f"Number of properties: {len(properties)}")
-
-# Export to csv
-properties.to_csv(filename, index=False)
-print(properties.head())
--- a/examples/price_of_land.py
+++ b/examples/price_of_land.py
@@ -0,0 +1,104 @@
+"""
+This script scrapes sold and pending land listings from the past year for a list of zip codes and saves the data to individual Excel files.
+It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre), so the user can analyze the average price of land in a zip code.
+"""
+
+import os
+import pandas as pd
+from homeharvest import scrape_property
+
+
+def get_property_details(zip: str, listing_type):
+    """Scrape land listings for one zip code and add 'lot_acres' and 'ppa' (price per acre).
+
+    Returns the result of ``scrape_property`` (property type "land", past 365 days) limited to
+    rows without 'sqft' and to the report columns; an empty result is returned unchanged.
+    """
+    properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
+    if properties.empty:
+        return properties
+
+    # Copy the filtered rows so the column assignments below do not write into a slice.
+    properties = properties[properties["sqft"].isnull()].copy()
+    properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None)
+
+    def price_per_acre(row):
+        # Sold price only counts for SOLD rows; everything else falls back to the list price.
+        use_sold = pd.notnull(row["sold_price"]) and row["status"] == "SOLD"
+        price = row["sold_price"] if use_sold else row["list_price"]
+        # Validate the price actually chosen (int(NaN) raises) and require a positive acreage.
+        if pd.isnull(price) or pd.isnull(row["lot_acres"]) or row["lot_acres"] <= 0:
+            return None
+        return int(price / row["lot_acres"])
+
+    # Built from a list rather than DataFrame.apply so an empty selection still gives an Int64 column.
+    properties["ppa"] = pd.array([price_per_acre(row) for _, row in properties.iterrows()], dtype="Int64")
+    selected_columns = [
+        "property_url",
+        "property_id",
+        "style",
+        "status",
+        "street",
+        "city",
+        "state",
+        "zip_code",
+        "county",
+        "list_date",
+        "last_sold_date",
+        "list_price",
+        "sold_price",
+        "lot_sqft",
+        "lot_acres",
+        "ppa",
+    ]
+    return properties[selected_columns]
+
+
+def output_to_excel(zip_code, sold_df, pending_df):
+    """Write the sold and pending frames for one zip code to Excel files.
+
+    Both files are placed in ``<cwd>/zips/<zip_code>/``, which is created when missing.
+    """
+    target_dir = os.path.join(os.getcwd(), "zips", zip_code)
+    os.makedirs(target_dir, exist_ok=True)
+    # Pair each output file name with its frame; both are written without the index column.
+    outputs = (
+        (f"{zip_code}_sold.xlsx", sold_df),
+        (f"{zip_code}_pending.xlsx", pending_df),
+    )
+    for file_name, frame in outputs:
+        frame.to_excel(os.path.join(target_dir, file_name), index=False)
+
+
+zip_codes = map(
+    str,
+    [
+        22920,
+        77024,
+        78028,
+        24553,
+        22967,
+        22971,
+        22922,
+        22958,
+        22969,
+        22949,
+        22938,
+        24599,
+        24562,
+        22976,
+        24464,
+        22964,
+        24581,
+    ],
+)
+
+frames = []  # collected and concatenated once, instead of re-copying a running total per zip code
+for zip_code in zip_codes:  # not 'zip': that would rebind the builtin at module level
+    sold_df = get_property_details(zip_code, "sold")
+    pending_df = get_property_details(zip_code, "pending")
+    frames += [sold_df, pending_df]
+    output_to_excel(zip_code, sold_df, pending_df)
+combined_df = pd.concat(frames, ignore_index=True)
+combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
+combined_df.to_excel(combined_file, index=False)