105 lines
3.1 KiB
Python
105 lines
3.1 KiB
Python
|
"""
|
||
|
This script scrapes sold and pending sold land listings in past year for a list of zip codes and saves the data to individual Excel files.
|
||
|
It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre) for user to analyze average price of land in a zip code.
|
||
|
"""
|
||
|
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
from homeharvest import scrape_property
|
||
|
|
||
|
|
||
|
def get_property_details(zip: str, listing_type):
|
||
|
properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
|
||
|
if not properties.empty:
|
||
|
properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None)
|
||
|
|
||
|
properties = properties[properties["sqft"].isnull()]
|
||
|
properties["ppa"] = properties.apply(
|
||
|
lambda row: (
|
||
|
int(
|
||
|
(
|
||
|
row["sold_price"]
|
||
|
if (pd.notnull(row["sold_price"]) and row["status"] == "SOLD")
|
||
|
else row["list_price"]
|
||
|
)
|
||
|
/ row["lot_acres"]
|
||
|
)
|
||
|
if pd.notnull(row["lot_acres"])
|
||
|
and row["lot_acres"] > 0
|
||
|
and (pd.notnull(row["sold_price"]) or pd.notnull(row["list_price"]))
|
||
|
else None
|
||
|
),
|
||
|
axis=1,
|
||
|
)
|
||
|
properties["ppa"] = properties["ppa"].astype("Int64")
|
||
|
selected_columns = [
|
||
|
"property_url",
|
||
|
"property_id",
|
||
|
"style",
|
||
|
"status",
|
||
|
"street",
|
||
|
"city",
|
||
|
"state",
|
||
|
"zip_code",
|
||
|
"county",
|
||
|
"list_date",
|
||
|
"last_sold_date",
|
||
|
"list_price",
|
||
|
"sold_price",
|
||
|
"lot_sqft",
|
||
|
"lot_acres",
|
||
|
"ppa",
|
||
|
]
|
||
|
properties = properties[selected_columns]
|
||
|
return properties
|
||
|
|
||
|
|
||
|
def output_to_excel(zip_code, sold_df, pending_df):
|
||
|
root_folder = os.getcwd()
|
||
|
zip_folder = os.path.join(root_folder, "zips", zip_code)
|
||
|
|
||
|
# Create zip code folder if it doesn't exist
|
||
|
os.makedirs(zip_folder, exist_ok=True)
|
||
|
|
||
|
# Define file paths
|
||
|
sold_file = os.path.join(zip_folder, f"{zip_code}_sold.xlsx")
|
||
|
pending_file = os.path.join(zip_folder, f"{zip_code}_pending.xlsx")
|
||
|
|
||
|
# Save individual sold and pending files
|
||
|
sold_df.to_excel(sold_file, index=False)
|
||
|
pending_df.to_excel(pending_file, index=False)
|
||
|
|
||
|
|
||
|
zip_codes = map(
|
||
|
str,
|
||
|
[
|
||
|
22920,
|
||
|
77024,
|
||
|
78028,
|
||
|
24553,
|
||
|
22967,
|
||
|
22971,
|
||
|
22922,
|
||
|
22958,
|
||
|
22969,
|
||
|
22949,
|
||
|
22938,
|
||
|
24599,
|
||
|
24562,
|
||
|
22976,
|
||
|
24464,
|
||
|
22964,
|
||
|
24581,
|
||
|
],
|
||
|
)
|
||
|
|
||
|
combined_df = pd.DataFrame()
|
||
|
for zip in zip_codes:
|
||
|
sold_df = get_property_details(zip, "sold")
|
||
|
pending_df = get_property_details(zip, "pending")
|
||
|
combined_df = pd.concat([combined_df, sold_df, pending_df], ignore_index=True)
|
||
|
output_to_excel(zip, sold_df, pending_df)
|
||
|
|
||
|
combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
|
||
|
combined_df.to_excel(combined_file, index=False)
|