"""Fetch real-estate data for US states with HomeHarvest and export one file per state.

For every requested state the script pulls each listing type (sold, for_sale,
for_rent, pending) via homeharvest.scrape_property, concatenates the results,
caps them at --max_rows, renames columns to human-friendly headers, and writes
a CSV or Excel file into --output_dir. Progress and errors are appended to
fetch_all_states.log; states can be processed in parallel with --processes.
"""
import os
import sys
import logging
from datetime import datetime, timedelta
from homeharvest import scrape_property
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse

# List of all US states (abbreviations)
US_STATES = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
]

LISTING_TYPES = ['sold', 'for_sale', 'for_rent', 'pending']
DAYS_PER_CHUNK = 30  # NOTE(review): unused — presumably meant for date-range chunking; confirm before removing
START_DATE = datetime(2024, 1, 1)
END_DATE = datetime.now()
OUTPUT_DIR = 'state_exports'
# Created at import time so the default directory exists even if main() is
# bypassed; main() additionally creates --output_dir for non-default paths.
os.makedirs(OUTPUT_DIR, exist_ok=True)

logging.basicConfig(
    filename='fetch_all_states.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)


def parse_args():
    """Parse CLI options controlling states, listing types, dates, limits and output."""
    parser = argparse.ArgumentParser(description="Fetch real estate data for US states using HomeHarvest.")
    parser.add_argument('--states', nargs='+', default=US_STATES, help='List of state abbreviations to fetch (default: all states)')
    parser.add_argument('--listing_types', nargs='+', default=LISTING_TYPES, help='Listing types to fetch (default: all types)')
    parser.add_argument('--start_date', type=str, default='2024-01-01', help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end_date', type=str, default=datetime.now().strftime('%Y-%m-%d'), help='End date (YYYY-MM-DD)')
    parser.add_argument('--output_dir', type=str, default='state_exports', help='Output directory')
    parser.add_argument('--max_rows', type=int, default=10000, help='Max properties per state')
    parser.add_argument('--output_format', choices=['csv', 'excel'], default='csv', help='Output file format')
    parser.add_argument('--processes', type=int, default=1, help='Number of parallel processes')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite existing files')
    return parser.parse_args()


def fetch_state_10000(state, listing_type):
    """Fetch up to 10,000 properties for one state and listing type.

    Uses the module-level START_DATE/END_DATE window. Returns the scraped
    DataFrame, or None when nothing was found or the scrape failed.
    NOTE(review): no caller in this file uses it; fetch_state_custom is the
    parameterized equivalent used by the CLI path.
    """
    try:
        properties = scrape_property(
            location=state,
            listing_type=listing_type,
            date_from=START_DATE.strftime('%Y-%m-%d'),
            date_to=END_DATE.strftime('%Y-%m-%d'),
            limit=10000,
            extra_property_data=True  # Fetch extra details for each property
        )
        if len(properties) > 0:
            return properties
        logging.info("No properties found for %s %s", state, listing_type)
        return None
    except Exception as e:
        logging.error("Error fetching %s %s: %s", state, listing_type, e)
        return None


def fetch_state_custom(state, listing_types, start_date, end_date, max_rows):
    """Fetch up to max_rows properties for a state across several listing types.

    Each listing type is scraped independently; a failure for one type is
    logged and does not abort the others. Returns the concatenated DataFrame
    capped at max_rows, or None when nothing was fetched.
    """
    all_chunks = []
    for listing_type in listing_types:
        try:
            properties = scrape_property(
                location=state,
                listing_type=listing_type,
                date_from=start_date,
                date_to=end_date,
                limit=max_rows,
                extra_property_data=True
            )
            if len(properties) > 0:
                all_chunks.append(properties)
        except Exception as e:
            logging.error("Error fetching %s %s: %s", state, listing_type, e)
    if all_chunks:
        df_state = pd.concat(all_chunks, ignore_index=True)
        return df_state.head(max_rows)
    return None


def process_state_cli(args_tuple):
    """Fetch, rename and export one state's data (worker for Pool.map).

    args_tuple: (state, listing_types, start_date, end_date, max_rows,
                 output_dir, output_format, overwrite, columns_map).
    Writes <output_dir>/<state>_properties.csv|xlsx unless it already exists
    and --overwrite was not given.
    """
    (state, listing_types, start_date, end_date, max_rows,
     output_dir, output_format, overwrite, columns_map) = args_tuple
    ext = 'xlsx' if output_format == 'excel' else 'csv'
    filename = os.path.join(output_dir, f"{state}_properties.{ext}")
    if os.path.exists(filename) and not overwrite:
        # BUGFIX: message previously read "Skipping (unknown)" — the
        # {filename} placeholder had been lost from the f-string.
        logging.info("Skipping %s, already exists.", filename)
        print(f"Skipping {filename}, already exists.")
        return
    df_state = fetch_state_custom(state, listing_types, start_date, end_date, max_rows)
    if df_state is not None:
        # Export only the columns we know how to label, renamed to friendly headers.
        available_cols = [col for col in columns_map.keys() if col in df_state.columns]
        df_export = df_state[available_cols].rename(
            columns={k: v for k, v in columns_map.items() if k in available_cols})
        if output_format == 'excel':
            df_export.to_excel(filename, index=False)  # requires openpyxl at runtime
        else:
            df_export.to_csv(filename, index=False)
        # BUGFIX: restore the {filename} placeholder ("... to (unknown)" before);
        # the committed log shows real paths were originally printed here.
        logging.info("Saved %d properties for %s to %s", len(df_export), state, filename)
        print(f"Saved {len(df_export)} properties for {state} to {filename}")
    else:
        logging.info("No data for %s", state)
        print(f"No data for {state}")


def main():
    """CLI entry point: build the per-state task list and run serially or in a Pool."""
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    # Maps raw scraper column names to human-friendly export headers; columns
    # absent from a given result set are simply skipped at export time.
    columns_map = {
        # 1. Price
        'list_price': 'Listing Price',
        # 2. Images
        'photos': 'Images',
        'virtual_tour_url': 'Virtual Tour',
        # 3. Property Details
        'beds': 'Bedrooms',
        'full_baths': 'Full Baths',
        'half_baths': 'Half Baths',
        'sqft': 'Square Footage',
        'year_built': 'Year Built',
        'lot_sqft': 'Lot Size',
        'stories': 'Stories',
        # 4. Pricing Metrics
        'price_per_sqft': 'Price per Sqft',
        'estimated_value': 'Estimated Market Value',
        'price_history': 'Price History',
        # 5. Financial & Fees
        'hoa_fee': 'Monthly HOA Fee',
        'monthly_cost': 'Monthly Cost Calculator',
        'estimated_monthly_payment': 'Estimated Monthly Payments',
        # 6. Property Description & Highlights
        'description': "What's Special About This Property",
        'features': 'Features & Upgrades',
        'special_features': 'Unique Selling Points',
        # 7. Multimedia & Virtual Tours
        'tour_3d_url': '3D Tour',
        'video_tour_url': 'Video Tour',
        # 8. Source & Listing Data
        'agent_name': 'Listed By',
        'mls_id': 'MLS Number',
        'mls': 'Originating MLS',
        # 9. Legal & Tax Information
        'tax_history': 'Public Tax History',
        'tax': 'Tax Assessed Value',
        # 10. Facts & Features
        'property_type': 'Property Type',
        'interior_features': 'Interior Features',
        'exterior_features': 'Exterior Features',
        # 11. Historical Data
        'sale_history': 'Sale History',
        # 12. Environmental & Climate Risk
        'climate_risk': 'Climate Risk',
        # 13. Getting Around
        'commute': 'Getting Around',
        # 14. Nearby Amenities & Education
        'nearby_schools': 'Nearby Schools',
        'nearby_cities': 'Nearby Cities',
        'parks': 'Parks & Recreation',
    }
    tasks = [
        (state, args.listing_types, args.start_date, args.end_date, args.max_rows,
         args.output_dir, args.output_format, args.overwrite, columns_map)
        for state in args.states
    ]
    if args.processes > 1:
        with Pool(args.processes) as pool:
            pool.map(process_state_cli, tasks)
    else:
        for t in tasks:
            process_state_cli(t)
    print("Done. See fetch_all_states.log for details.")


if __name__ == "__main__":
    main()
"""Fetch real-estate data for US states with HomeHarvest and export one file per state.

NOTE(review): this file (`yates`) is a byte-for-byte duplicate of
examples/fetch_all_states.py committed at the repository root — it should
almost certainly be deleted from the commit rather than maintained twice.
The same fixes are applied here so the two copies stay consistent.
"""
import os
import sys
import logging
from datetime import datetime, timedelta
from homeharvest import scrape_property
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse

# List of all US states (abbreviations)
US_STATES = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
]

LISTING_TYPES = ['sold', 'for_sale', 'for_rent', 'pending']
DAYS_PER_CHUNK = 30  # NOTE(review): unused — presumably meant for date-range chunking; confirm before removing
START_DATE = datetime(2024, 1, 1)
END_DATE = datetime.now()
OUTPUT_DIR = 'state_exports'
# Created at import time so the default directory exists even if main() is
# bypassed; main() additionally creates --output_dir for non-default paths.
os.makedirs(OUTPUT_DIR, exist_ok=True)

logging.basicConfig(
    filename='fetch_all_states.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)


def parse_args():
    """Parse CLI options controlling states, listing types, dates, limits and output."""
    parser = argparse.ArgumentParser(description="Fetch real estate data for US states using HomeHarvest.")
    parser.add_argument('--states', nargs='+', default=US_STATES, help='List of state abbreviations to fetch (default: all states)')
    parser.add_argument('--listing_types', nargs='+', default=LISTING_TYPES, help='Listing types to fetch (default: all types)')
    parser.add_argument('--start_date', type=str, default='2024-01-01', help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end_date', type=str, default=datetime.now().strftime('%Y-%m-%d'), help='End date (YYYY-MM-DD)')
    parser.add_argument('--output_dir', type=str, default='state_exports', help='Output directory')
    parser.add_argument('--max_rows', type=int, default=10000, help='Max properties per state')
    parser.add_argument('--output_format', choices=['csv', 'excel'], default='csv', help='Output file format')
    parser.add_argument('--processes', type=int, default=1, help='Number of parallel processes')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite existing files')
    return parser.parse_args()


def fetch_state_10000(state, listing_type):
    """Fetch up to 10,000 properties for one state and listing type.

    Uses the module-level START_DATE/END_DATE window. Returns the scraped
    DataFrame, or None when nothing was found or the scrape failed.
    NOTE(review): no caller in this file uses it; fetch_state_custom is the
    parameterized equivalent used by the CLI path.
    """
    try:
        properties = scrape_property(
            location=state,
            listing_type=listing_type,
            date_from=START_DATE.strftime('%Y-%m-%d'),
            date_to=END_DATE.strftime('%Y-%m-%d'),
            limit=10000,
            extra_property_data=True  # Fetch extra details for each property
        )
        if len(properties) > 0:
            return properties
        logging.info("No properties found for %s %s", state, listing_type)
        return None
    except Exception as e:
        logging.error("Error fetching %s %s: %s", state, listing_type, e)
        return None


def fetch_state_custom(state, listing_types, start_date, end_date, max_rows):
    """Fetch up to max_rows properties for a state across several listing types.

    Each listing type is scraped independently; a failure for one type is
    logged and does not abort the others. Returns the concatenated DataFrame
    capped at max_rows, or None when nothing was fetched.
    """
    all_chunks = []
    for listing_type in listing_types:
        try:
            properties = scrape_property(
                location=state,
                listing_type=listing_type,
                date_from=start_date,
                date_to=end_date,
                limit=max_rows,
                extra_property_data=True
            )
            if len(properties) > 0:
                all_chunks.append(properties)
        except Exception as e:
            logging.error("Error fetching %s %s: %s", state, listing_type, e)
    if all_chunks:
        df_state = pd.concat(all_chunks, ignore_index=True)
        return df_state.head(max_rows)
    return None


def process_state_cli(args_tuple):
    """Fetch, rename and export one state's data (worker for Pool.map).

    args_tuple: (state, listing_types, start_date, end_date, max_rows,
                 output_dir, output_format, overwrite, columns_map).
    Writes <output_dir>/<state>_properties.csv|xlsx unless it already exists
    and --overwrite was not given.
    """
    (state, listing_types, start_date, end_date, max_rows,
     output_dir, output_format, overwrite, columns_map) = args_tuple
    ext = 'xlsx' if output_format == 'excel' else 'csv'
    filename = os.path.join(output_dir, f"{state}_properties.{ext}")
    if os.path.exists(filename) and not overwrite:
        # BUGFIX: message previously read "Skipping (unknown)" — the
        # {filename} placeholder had been lost from the f-string.
        logging.info("Skipping %s, already exists.", filename)
        print(f"Skipping {filename}, already exists.")
        return
    df_state = fetch_state_custom(state, listing_types, start_date, end_date, max_rows)
    if df_state is not None:
        # Export only the columns we know how to label, renamed to friendly headers.
        available_cols = [col for col in columns_map.keys() if col in df_state.columns]
        df_export = df_state[available_cols].rename(
            columns={k: v for k, v in columns_map.items() if k in available_cols})
        if output_format == 'excel':
            df_export.to_excel(filename, index=False)  # requires openpyxl at runtime
        else:
            df_export.to_csv(filename, index=False)
        # BUGFIX: restore the {filename} placeholder ("... to (unknown)" before);
        # the committed log shows real paths were originally printed here.
        logging.info("Saved %d properties for %s to %s", len(df_export), state, filename)
        print(f"Saved {len(df_export)} properties for {state} to {filename}")
    else:
        logging.info("No data for %s", state)
        print(f"No data for {state}")


def main():
    """CLI entry point: build the per-state task list and run serially or in a Pool."""
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    # Maps raw scraper column names to human-friendly export headers; columns
    # absent from a given result set are simply skipped at export time.
    columns_map = {
        # 1. Price
        'list_price': 'Listing Price',
        # 2. Images
        'photos': 'Images',
        'virtual_tour_url': 'Virtual Tour',
        # 3. Property Details
        'beds': 'Bedrooms',
        'full_baths': 'Full Baths',
        'half_baths': 'Half Baths',
        'sqft': 'Square Footage',
        'year_built': 'Year Built',
        'lot_sqft': 'Lot Size',
        'stories': 'Stories',
        # 4. Pricing Metrics
        'price_per_sqft': 'Price per Sqft',
        'estimated_value': 'Estimated Market Value',
        'price_history': 'Price History',
        # 5. Financial & Fees
        'hoa_fee': 'Monthly HOA Fee',
        'monthly_cost': 'Monthly Cost Calculator',
        'estimated_monthly_payment': 'Estimated Monthly Payments',
        # 6. Property Description & Highlights
        'description': "What's Special About This Property",
        'features': 'Features & Upgrades',
        'special_features': 'Unique Selling Points',
        # 7. Multimedia & Virtual Tours
        'tour_3d_url': '3D Tour',
        'video_tour_url': 'Video Tour',
        # 8. Source & Listing Data
        'agent_name': 'Listed By',
        'mls_id': 'MLS Number',
        'mls': 'Originating MLS',
        # 9. Legal & Tax Information
        'tax_history': 'Public Tax History',
        'tax': 'Tax Assessed Value',
        # 10. Facts & Features
        'property_type': 'Property Type',
        'interior_features': 'Interior Features',
        'exterior_features': 'Exterior Features',
        # 11. Historical Data
        'sale_history': 'Sale History',
        # 12. Environmental & Climate Risk
        'climate_risk': 'Climate Risk',
        # 13. Getting Around
        'commute': 'Getting Around',
        # 14. Nearby Amenities & Education
        'nearby_schools': 'Nearby Schools',
        'nearby_cities': 'Nearby Cities',
        'parks': 'Parks & Recreation',
    }
    tasks = [
        (state, args.listing_types, args.start_date, args.end_date, args.max_rows,
         args.output_dir, args.output_format, args.overwrite, columns_map)
        for state in args.states
    ]
    if args.processes > 1:
        with Pool(args.processes) as pool:
            pool.map(process_state_cli, tasks)
    else:
        for t in tasks:
            process_state_cli(t)
    print("Done. See fetch_all_states.log for details.")


if __name__ == "__main__":
    main()