enh: property type (#102)

This commit is contained in:
Cullen Watson
2024-11-03 17:23:07 -06:00
committed by GitHub
parent 1f717bd9e3
commit 8e04f6b117
11 changed files with 274 additions and 241 deletions

View File

@@ -1,141 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"from homeharvest import scrape_property\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None) # Show all columns\n",
"pd.set_option('display.max_rows', None) # Show all rows\n",
"pd.set_option('display.width', None) # Auto-adjust display width to fit console\n",
"pd.set_option('display.max_colwidth', 50) # Limit max column width to 50 characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
"metadata": {},
"outputs": [],
"source": [
"# check for sale properties\n",
"scrape_property(\n",
" location=\"dallas\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaf86093",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# search a specific address\n",
"scrape_property(\n",
" location=\"2530 Al Lipscomb Way\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
"metadata": {},
"outputs": [],
"source": [
"# check rentals\n",
"scrape_property(\n",
" location=\"chicago, illinois\",\n",
" listing_type=\"for_rent\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af280cd3",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# check sold properties\n",
"properties = scrape_property(\n",
" location=\"90210\",\n",
" listing_type=\"sold\",\n",
" past_days=10\n",
")\n",
"display(properties)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "628c1ce2",
"metadata": {
"collapsed": false,
"is_executing": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# display clickable URLs\n",
"from IPython.display import display, HTML\n",
"properties['property_url'] = '<a href=\"' + properties['property_url'] + '\" target=\"_blank\">' + properties['property_url'] + '</a>'\n",
"\n",
"html = properties.to_html(escape=False)\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,20 +0,0 @@
from homeharvest import scrape_property
from datetime import datetime
# Name the output file after the current local time (second resolution)
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

# Search options handed to scrape_property as keyword arguments
search_options = {
    "location": "San Diego, CA",
    "listing_type": "sold",  # or (for_sale, for_rent)
    "past_days": 30,  # sold in last 30 days - listed in last x days if (for_sale, for_rent)
    # "pending_or_contingent": True,  # use on for_sale listings to find pending / contingent listings
    # "mls_only": True,  # only fetch MLS listings
    # "proxy": "http://user:pass@host:port",  # use a proxy to change your IP address
}
properties = scrape_property(**search_options)
print(f"Number of properties: {len(properties)}")

# Write every row to the CSV without the index column, then show a preview
properties.to_csv(filename, index=False)
preview = properties.head()
print(preview)

104
examples/price_of_land.py Normal file
View File

@@ -0,0 +1,104 @@
"""
This script scrapes sold and pending land listings from the past year for a list of zip codes and saves the data to individual Excel files.
It adds two columns to the data, 'lot_acres' and 'ppa' (price per acre), so the user can analyze the average price of land in a zip code.
"""
import os
import pandas as pd
from homeharvest import scrape_property
_SQFT_PER_ACRE = 43560  # square feet in one acre


def _price_per_acre(row):
    """Return the price per acre of one listing row truncated to an int, or None.

    The sold price is used only for rows whose status is "SOLD" and whose
    sold price is present; every other row uses the list price. None is
    returned when the lot size is missing or not positive, or when the
    price selected for the row is missing.
    """
    acres = row["lot_acres"]
    if pd.isnull(acres) or acres <= 0:
        return None

    if pd.notnull(row["sold_price"]) and row["status"] == "SOLD":
        price = row["sold_price"]
    else:
        price = row["list_price"]

    # Check the price that was actually selected: a non-SOLD row can carry a
    # sold price while its list price is missing, and int(NaN) would raise.
    if pd.isnull(price):
        return None
    return int(price / acres)


def get_property_details(zip: str, listing_type):
    """Scrape land listings for one ZIP code and add acreage and price-per-acre columns.

    Args:
        zip: Location passed to ``scrape_property`` (a ZIP code string). The
            name shadows the builtin ``zip`` inside this function; it is kept
            so existing keyword callers continue to work.
        listing_type: Listing type forwarded unchanged to ``scrape_property``
            (the script in this file passes "sold" and "pending").

    Returns:
        The object returned by ``scrape_property`` (used here as a pandas
        DataFrame). When it is empty it is returned untouched. Otherwise only
        rows with a null ``sqft`` are kept, ``lot_acres`` and ``ppa`` are
        added, and the columns are reduced to a fixed selection.
    """
    properties = scrape_property(
        location=zip,
        listing_type=listing_type,
        property_type=["land"],
        past_days=365,
    )
    if properties.empty:
        return properties

    # Keep only rows with no recorded sqft. The explicit copy makes the
    # filtered frame independent, so the column assignments below do not
    # raise SettingWithCopyWarning.
    properties = properties[properties["sqft"].isnull()].copy()

    properties["lot_acres"] = properties["lot_sqft"].apply(
        lambda sqft: sqft / _SQFT_PER_ACRE if pd.notnull(sqft) else None
    )

    # Build the nullable-integer column directly so rows without a usable
    # price or lot size become <NA> instead of forcing a float column.
    ppa_values = [_price_per_acre(row) for _, row in properties.iterrows()]
    properties["ppa"] = pd.Series(ppa_values, index=properties.index, dtype="Int64")

    selected_columns = [
        "property_url",
        "property_id",
        "style",
        "status",
        "street",
        "city",
        "state",
        "zip_code",
        "county",
        "list_date",
        "last_sold_date",
        "list_price",
        "sold_price",
        "lot_sqft",
        "lot_acres",
        "ppa",
    ]
    return properties[selected_columns]
def output_to_excel(zip_code, sold_df, pending_df):
    """Write the sold and pending frames for one ZIP code to separate Excel files.

    Both files go into ``<current working directory>/zips/<zip_code>/``,
    which is created when it does not exist yet.

    Args:
        zip_code: ZIP code string, used as the folder name and file-name prefix.
        sold_df: Frame written to ``<zip_code>_sold.xlsx``.
        pending_df: Frame written to ``<zip_code>_pending.xlsx``.
    """
    zip_folder = os.path.join(os.getcwd(), "zips", zip_code)
    os.makedirs(zip_folder, exist_ok=True)

    # Sold is written first, then pending; neither file gets an index column.
    outputs = (
        (f"{zip_code}_sold.xlsx", sold_df),
        (f"{zip_code}_pending.xlsx", pending_df),
    )
    for file_name, frame in outputs:
        frame.to_excel(os.path.join(zip_folder, file_name), index=False)
# ZIP codes to analyze. They are written as strings because they are used as
# search locations and folder names, and an integer literal cannot represent
# a ZIP code with leading zeros.
zip_codes = [
    "22920",
    "77024",
    "78028",
    "24553",
    "22967",
    "22971",
    "22922",
    "22958",
    "22969",
    "22949",
    "22938",
    "24599",
    "24562",
    "22976",
    "24464",
    "22964",
    "24581",
]

# Collect the per-ZIP frames and concatenate once after the loop: calling
# pd.concat inside the loop copies the whole accumulated frame every time.
frames = []
for zip_code in zip_codes:  # not named `zip`, so the builtin is not rebound
    sold_df = get_property_details(zip_code, "sold")
    pending_df = get_property_details(zip_code, "pending")
    # Skip empty results so they take no part in the concatenation.
    frames.extend(df for df in (sold_df, pending_df) if not df.empty)
    output_to_excel(zip_code, sold_df, pending_df)

combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
# The folder normally exists after the loop; create it anyway so an empty
# zip_codes list still produces the combined file instead of failing.
os.makedirs(os.path.dirname(combined_file), exist_ok=True)
combined_df.to_excel(combined_file, index=False)