enh: property type (#102)

2026-03-04 19:44:29 -08:00 · 2024-11-03 17:23:07 -06:00
parent 1f717bd9e3
commit 8e04f6b117
11 changed files with 274 additions and 241 deletions
--- a/examples/price_of_land.py
+++ b/examples/price_of_land.py
@@ -0,0 +1,104 @@
+"""
+This script scrapes sold and pending sold land listings in past year for a list of zip codes and saves the data to individual Excel files.
+It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre) for user to analyze average price of land in a zip code.
+"""
+
+import os
+import pandas as pd
+from homeharvest import scrape_property
+
+
+def get_property_details(zip: str, listing_type):
+    properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
+    if not properties.empty:
+        properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None)
+
+        properties = properties[properties["sqft"].isnull()]
+        properties["ppa"] = properties.apply(
+            lambda row: (
+                int(
+                    (
+                        row["sold_price"]
+                        if (pd.notnull(row["sold_price"]) and row["status"] == "SOLD")
+                        else row["list_price"]
+                    )
+                    / row["lot_acres"]
+                )
+                if pd.notnull(row["lot_acres"])
+                and row["lot_acres"] > 0
+                and (pd.notnull(row["sold_price"]) or pd.notnull(row["list_price"]))
+                else None
+            ),
+            axis=1,
+        )
+        properties["ppa"] = properties["ppa"].astype("Int64")
+        selected_columns = [
+            "property_url",
+            "property_id",
+            "style",
+            "status",
+            "street",
+            "city",
+            "state",
+            "zip_code",
+            "county",
+            "list_date",
+            "last_sold_date",
+            "list_price",
+            "sold_price",
+            "lot_sqft",
+            "lot_acres",
+            "ppa",
+        ]
+        properties = properties[selected_columns]
+    return properties
+
+
+def output_to_excel(zip_code, sold_df, pending_df):
+    root_folder = os.getcwd()
+    zip_folder = os.path.join(root_folder, "zips", zip_code)
+
+    # Create zip code folder if it doesn't exist
+    os.makedirs(zip_folder, exist_ok=True)
+
+    # Define file paths
+    sold_file = os.path.join(zip_folder, f"{zip_code}_sold.xlsx")
+    pending_file = os.path.join(zip_folder, f"{zip_code}_pending.xlsx")
+
+    # Save individual sold and pending files
+    sold_df.to_excel(sold_file, index=False)
+    pending_df.to_excel(pending_file, index=False)
+
+
+zip_codes = map(
+    str,
+    [
+        22920,
+        77024,
+        78028,
+        24553,
+        22967,
+        22971,
+        22922,
+        22958,
+        22969,
+        22949,
+        22938,
+        24599,
+        24562,
+        22976,
+        24464,
+        22964,
+        24581,
+    ],
+)
+
+combined_df = pd.DataFrame()
+for zip in zip_codes:
+    sold_df = get_property_details(zip, "sold")
+    pending_df = get_property_details(zip, "pending")
+    combined_df = pd.concat([combined_df, sold_df, pending_df], ignore_index=True)
+    output_to_excel(zip, sold_df, pending_df)
+
+combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
+combined_df.to_excel(combined_file, index=False)