From c4870677c27c754f77229ef52c920ecb962cdc98 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 4 Oct 2023 10:11:53 -0500 Subject: [PATCH] [enh]: make last_x_days generic add mls_only make radius generic --- README.md | 95 ++++++++------ examples/HomeHarvest_Demo.ipynb | 5 +- examples/HomeHarvest_Demo.py | 18 +++ homeharvest/__init__.py | 108 ++++------------ homeharvest/cli.py | 36 +++++- homeharvest/core/scrapers/__init__.py | 14 +- homeharvest/core/scrapers/realtor/__init__.py | 120 ++++++++++-------- homeharvest/utils.py | 11 +- tests/test_realtor.py | 14 +- 9 files changed, 220 insertions(+), 201 deletions(-) create mode 100644 examples/HomeHarvest_Demo.py diff --git a/README.md b/README.md index f2ccc80..f8b2525 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,13 @@ pip install homeharvest ### CLI ``` -usage: homeharvest [-h] [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] location - +usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] location + Home Harvest Property Scraper - + positional arguments: location Location to scrape (e.g., San Francisco, CA) - + options: -l {for_sale,for_rent,sold}, --listing_type {for_sale,for_rent,sold} Listing type to scrape @@ -54,7 +54,8 @@ options: Proxy to use for scraping -d DAYS, --days DAYS Sold in last _ days filter. -r RADIUS, --radius RADIUS - Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses. + Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses. + -m, --mls_only If set, fetches only MLS listings. 
``` ```bash > homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest @@ -73,9 +74,14 @@ filename = f"output/{current_timestamp}.csv" properties = scrape_property( location="San Diego, CA", listing_type="sold", # for_sale, for_rent + last_x_days=30, # sold/listed in last 30 days + mls_only=True, # only fetch MLS listings ) print(f"Number of properties: {len(properties)}") + +# Export to csv properties.to_csv(filename, index=False) +print(properties.head()) ``` @@ -94,12 +100,23 @@ properties.to_csv(filename, index=False) ### Parameters for `scrape_property()` ``` Required -├── location (str): address in various formats e.g. just zip, full address, city/state, etc. -└── listing_type (enum): for_rent, for_sale, sold +├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc. +└── listing_type (option): Choose the type of listing. + - 'for_rent' + - 'for_sale' + - 'sold' + Optional -├── radius_for_comps (float): Radius in miles to find comparable properties based on individual addresses. -├── sold_last_x_days (int): Number of past days to filter sold properties. -├── proxy (str): in format 'http://user:pass@host:port' +├── radius (decimal): Radius in miles to find comparable properties based on individual addresses. +│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored) +│ +├── last_x_days (integer): Number of past days to filter properties. Utilizes 'COEDate' for 'sold' listing types, and 'Lst Date' for others (for_rent, for_sale). 
+│ Example: 30 (fetches properties listed/sold in the last 30 days) +│ +├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) +│ +└── proxy (string): In format 'http://user:pass@host:port' + ``` ### Property Schema ```plaintext @@ -111,51 +128,49 @@ Property │ └── status (str) ├── Address Details: -│ ├── street (str) -│ ├── unit (str) -│ ├── city (str) -│ ├── state (str) -│ └── zip (str) +│ ├── street +│ ├── unit +│ ├── city +│ ├── state +│ └── zip ├── Property Description: -│ ├── style (str) -│ ├── beds (int) -│ ├── baths_full (int) -│ ├── baths_half (int) -│ ├── sqft (int) -│ ├── lot_sqft (int) -│ ├── sold_price (int) -│ ├── year_built (int) -│ ├── garage (float) -│ └── stories (int) +│ ├── style +│ ├── beds +│ ├── baths_full +│ ├── baths_half +│ ├── sqft +│ ├── lot_sqft +│ ├── sold_price +│ ├── year_built +│ ├── garage +│ └── stories ├── Property Listing Details: -│ ├── list_price (int) -│ ├── list_date (str) -│ ├── last_sold_date (str) -│ ├── prc_sqft (int) -│ └── hoa_fee (int) +│ ├── list_price +│ ├── list_date +│ ├── last_sold_date +│ ├── prc_sqft +│ └── hoa_fee ├── Location Details: -│ ├── latitude (float) -│ ├── longitude (float) -│ └── neighborhoods (str) +│ ├── latitude +│ ├── longitude +│ └── neighborhoods ``` -## Supported Countries for Property Scraping - -* **Realtor.com**: mainly from the **US** but also has international listings ### Exceptions The following exceptions may be raised when using HomeHarvest: - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` -- `NoResultsFound` - no properties found from your input - +- `NoResultsFound` - no properties found from your search + + ## Frequently Asked Questions --- **Q: Encountering issues with your searches?** -**A:** Try to broaden the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). +**A:** Try to broaden the parameters you're using. 
If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). --- @@ -163,7 +178,7 @@ The following exceptions may be raised when using HomeHarvest: **A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend: - Waiting a few seconds between requests. -- Trying a VPN to change your IP address. +- Trying a VPN or using a proxy as a parameter to scrape_property() to change your IP address. --- diff --git a/examples/HomeHarvest_Demo.ipynb b/examples/HomeHarvest_Demo.ipynb index fb9106b..43a28be 100644 --- a/examples/HomeHarvest_Demo.ipynb +++ b/examples/HomeHarvest_Demo.ipynb @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "# scrapes all 3 sites by default\n", + "# check for sale properties\n", "scrape_property(\n", " location=\"dallas\",\n", " listing_type=\"for_sale\"\n", ")" ] @@ -53,7 +53,6 @@ "# search a specific address\n", "scrape_property(\n", " location=\"2530 Al Lipscomb Way\",\n", - " site_name=\"zillow\",\n", " listing_type=\"for_sale\"\n", ")" ] @@ -68,7 +67,6 @@ "# check rentals\n", "scrape_property(\n", " location=\"chicago, illinois\",\n", - " site_name=[\"redfin\", \"zillow\"],\n", " listing_type=\"for_rent\"\n", ")" ] @@ -88,7 +86,6 @@ "# check sold properties\n", "scrape_property(\n", " location=\"90210\",\n", - " site_name=[\"redfin\"],\n", " listing_type=\"sold\"\n", ")" ] diff --git a/examples/HomeHarvest_Demo.py b/examples/HomeHarvest_Demo.py new file mode 100644 index 0000000..9e8e053 --- /dev/null +++ b/examples/HomeHarvest_Demo.py @@ -0,0 +1,18 @@ +from homeharvest import scrape_property +from datetime import datetime + +# Generate filename based on current timestamp +current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") +filename = f"output/{current_timestamp}.csv" + +properties = scrape_property( + location="San Diego, CA", + listing_type="sold", # for_sale, for_rent + last_x_days=30, # sold/listed in last 30 days + mls_only=True, # only fetch 
MLS listings +) +print(f"Number of properties: {len(properties)}") + +# Export to csv +properties.to_csv(filename, index=False) +print(properties.head()) \ No newline at end of file diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 0e6ce6d..5d68d1d 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,103 +1,41 @@ +import warnings import pandas as pd -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor - from .core.scrapers import ScraperInput -from .utils import process_result, ordered_properties +from .utils import process_result, ordered_properties, validate_input from .core.scrapers.realtor import RealtorScraper -from .core.scrapers.models import ListingType, Property, SiteName -from .exceptions import InvalidListingType - - -_scrapers = { - "realtor.com": RealtorScraper, -} - - -def _validate_input(listing_type: str) -> None: - if listing_type.upper() not in ListingType.__members__: - raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.") - - -def _scrape_single_site(location: str, site_name: str, listing_type: str, radius: float, proxy: str = None, sold_last_x_days: int = None) -> pd.DataFrame: - """ - Helper function to scrape a single site. 
- """ - _validate_input(listing_type) - - scraper_input = ScraperInput( - location=location, - listing_type=ListingType[listing_type.upper()], - site_name=SiteName.get_by_value(site_name.lower()), - proxy=proxy, - radius=radius, - sold_last_x_days=sold_last_x_days - ) - - site = _scrapers[site_name.lower()](scraper_input) - results = site.search() - print(f"found {len(results)}") - - properties_dfs = [process_result(result) for result in results] - if not properties_dfs: - return pd.DataFrame() - - return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties] +from .core.scrapers.models import ListingType +from .exceptions import InvalidListingType, NoResultsFound def scrape_property( location: str, listing_type: str = "for_sale", radius: float = None, - sold_last_x_days: int = None, + mls_only: bool = False, + last_x_days: int = None, proxy: str = None, ) -> pd.DataFrame: """ Scrape properties from Realtor.com based on a given location and listing type. - - :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way') - :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold'). Default is 'for_sale'. - :param radius: Radius in miles to find comparable properties on individual addresses. Optional. - :param sold_last_x_days: Number of past days to filter sold properties. Optional. - :param proxy: Proxy IP address to be used for scraping. Optional. 
- :returns: pd.DataFrame containing properties """ - site_name = "realtor.com" + validate_input(listing_type) - if site_name is None: - site_name = list(_scrapers.keys()) + scraper_input = ScraperInput( + location=location, + listing_type=ListingType[listing_type.upper()], + proxy=proxy, + radius=radius, + mls_only=mls_only, + last_x_days=last_x_days, + ) - if not isinstance(site_name, list): - site_name = [site_name] + site = RealtorScraper(scraper_input) + results = site.search() - results = [] + properties_dfs = [process_result(result) for result in results] + if not properties_dfs: + raise NoResultsFound("no results found for the query") - if len(site_name) == 1: - final_df = _scrape_single_site(location, site_name[0], listing_type, radius, proxy, sold_last_x_days) - results.append(final_df) - else: - with ThreadPoolExecutor() as executor: - futures = { - executor.submit(_scrape_single_site, location, s_name, listing_type, radius, proxy, sold_last_x_days): s_name - for s_name in site_name - } - - for future in concurrent.futures.as_completed(futures): - result = future.result() - results.append(result) - - results = [df for df in results if not df.empty and not df.isna().all().all()] - - if not results: - return pd.DataFrame() - - final_df = pd.concat(results, ignore_index=True) - - columns_to_track = ["Street", "Unit", "Zip"] - - #: validate they exist, otherwise create them - for col in columns_to_track: - if col not in final_df.columns: - final_df[col] = None - - return final_df + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties] diff --git a/homeharvest/cli.py b/homeharvest/cli.py index 65850b4..b732486 100644 --- a/homeharvest/cli.py +++ b/homeharvest/cli.py @@ -5,7 +5,9 @@ from homeharvest import scrape_property def main(): parser = argparse.ArgumentParser(description="Home Harvest Property Scraper") - parser.add_argument("location", 
type=str, help="Location to scrape (e.g., San Francisco, CA)") + parser.add_argument( + "location", type=str, help="Location to scrape (e.g., San Francisco, CA)" + ) parser.add_argument( "-l", @@ -33,21 +35,41 @@ def main(): help="Name of the output file (without extension)", ) - parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping") - parser.add_argument("-d", "--days", type=int, default=None, help="Sold in last _ days filter.") + parser.add_argument( + "-p", "--proxy", type=str, default=None, help="Proxy to use for scraping" + ) + parser.add_argument( + "-d", + "--days", + type=int, + default=None, + help="Sold/listed in last _ days filter.", + ) parser.add_argument( "-r", - "--sold-properties-radius", - dest="sold_properties_radius", # This makes sure the parsed argument is stored as radius_for_comps in args + "--radius", type=float, default=None, - help="Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses." + help="Get comparable properties within _ (eg. 0.0) miles. 
Only applicable for individual addresses.", + ) + parser.add_argument( + "-m", + "--mls_only", + action="store_true", + help="If set, fetches only MLS listings.", ) args = parser.parse_args() - result = scrape_property(args.location, args.listing_type, radius_for_comps=args.radius_for_comps, proxy=args.proxy) + result = scrape_property( + args.location, + args.listing_type, + radius=args.radius, + proxy=args.proxy, + mls_only=args.mls_only, + last_x_days=args.days, + ) if not args.filename: timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index bc418e3..1ce4431 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -8,14 +8,18 @@ from .models import Property, ListingType, SiteName class ScraperInput: location: str listing_type: ListingType - site_name: SiteName radius: float | None = None + mls_only: bool | None = None proxy: str | None = None - sold_last_x_days: int | None = None + last_x_days: int | None = None class Scraper: - def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None): + def __init__( + self, + scraper_input: ScraperInput, + session: requests.Session | tls_client.Session = None, + ): self.location = scraper_input.location self.listing_type = scraper_input.listing_type @@ -30,9 +34,9 @@ class Scraper: self.session.proxies.update(proxies) self.listing_type = scraper_input.listing_type - self.site_name = scraper_input.site_name self.radius = scraper_input.radius - self.sold_last_x_days = scraper_input.sold_last_x_days + self.last_x_days = scraper_input.last_x_days + self.mls_only = scraper_input.mls_only def search(self) -> list[Property]: ... 
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 629a653..de1dcc4 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -106,12 +106,16 @@ class RealtorScraper(Scraper): Property( mls_id=property_id, property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", - address=self._parse_address(property_info, search_type="handle_address"), - description=self._parse_description(property_info) + address=self._parse_address( + property_info, search_type="handle_address" + ), + description=self._parse_description(property_info), ) ] - def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]: + def general_search( + self, variables: dict, search_type: str + ) -> Dict[str, Union[int, list[Property]]]: """ Handles a location area & returns a list of properties """ @@ -169,17 +173,23 @@ class RealtorScraper(Scraper): } }""" - sold_date_param = ('sold_date: { min: "$today-%sD" }' % self.sold_last_x_days - if self.listing_type == ListingType.SOLD and self.sold_last_x_days - else "") - sort_param = ('sort: [{ field: sold_date, direction: desc }]' - if self.listing_type == ListingType.SOLD - else 'sort: [{ field: list_date, direction: desc }]') + date_param = ( + 'sold_date: { min: "$today-%sD" }' % self.last_x_days + if self.listing_type == ListingType.SOLD and self.last_x_days + else ( + 'list_date: { min: "$today-%sD" }' % self.last_x_days + if self.last_x_days + else "" + ) + ) + sort_param = ( + "sort: [{ field: sold_date, direction: desc }]" + if self.listing_type == ListingType.SOLD + else "sort: [{ field: list_date, direction: desc }]" + ) if search_type == "comps": - print('general - comps') - query = ( - """query Property_search( + query = """query Property_search( $coordinates: [Float]! $radius: String! 
$offset: Int!, @@ -197,16 +207,13 @@ class RealtorScraper(Scraper): limit: 200 offset: $offset ) %s""" % ( - self.listing_type.value.lower(), - sold_date_param, - sort_param, - results_query - ) + self.listing_type.value.lower(), + date_param, + sort_param, + results_query, ) else: - print('general - not comps') - query = ( - """query Home_search( + query = """query Home_search( $city: String, $county: [String], $state_code: String, @@ -225,13 +232,11 @@ class RealtorScraper(Scraper): %s limit: 200 offset: $offset - ) %s""" - % ( - self.listing_type.value.lower(), - sold_date_param, - sort_param, - results_query - ) + ) %s""" % ( + self.listing_type.value.lower(), + date_param, + sort_param, + results_query, ) payload = { @@ -247,12 +252,12 @@ class RealtorScraper(Scraper): properties: list[Property] = [] if ( - response_json is None - or "data" not in response_json - or response_json["data"] is None - or search_key not in response_json["data"] - or response_json["data"][search_key] is None - or "results" not in response_json["data"][search_key] + response_json is None + or "data" not in response_json + or response_json["data"] is None + or search_key not in response_json["data"] + or response_json["data"][search_key] is None + or "results" not in response_json["data"][search_key] ): return {"total": 0, "properties": []} @@ -264,32 +269,44 @@ class RealtorScraper(Scraper): else None ) - if not mls: + if not mls and self.mls_only: continue - able_to_get_lat_long = result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") + able_to_get_lat_long = ( + result + and result.get("location") + and result["location"].get("address") + and result["location"]["address"].get("coordinate") + ) realty_property = Property( mls=mls, - mls_id=result["source"].get("listing_id") if "source" in result and isinstance(result["source"], dict) else None, + mls_id=result["source"].get("listing_id") + if "source" in result and 
isinstance(result["source"], dict) + else None, property_url=f"{self.PROPERTY_URL}{result['property_id']}", status=result["status"].upper(), list_price=result["list_price"], - list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, + list_date=result["list_date"].split("T")[0] + if result.get("list_date") + else None, prc_sqft=result.get("price_per_sqft"), last_sold_date=result.get("last_sold_date"), - hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None, - latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None, - longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, + hoa_fee=result["hoa"]["fee"] + if result.get("hoa") and isinstance(result["hoa"], dict) + else None, + latitude=result["location"]["address"]["coordinate"].get("lat") + if able_to_get_lat_long + else None, + longitude=result["location"]["address"]["coordinate"].get("lon") + if able_to_get_lat_long + else None, address=self._parse_address(result, search_type="general_search"), neighborhoods=self._parse_neighborhoods(result), - description=self._parse_description(result) + description=self._parse_description(result), ) properties.append(realty_property) - - # print(response_json["data"]["property_search"], variables["offset"]) - # print(response_json["data"]["home_search"]["total"], variables["offset"]) return { "total": response_json["data"][search_key]["total"], "properties": properties, @@ -304,14 +321,13 @@ class RealtorScraper(Scraper): } search_type = "comps" if self.radius and location_type == "address" else "area" - print(search_type) if location_type == "address": - if not self.radius: #: single address search, non comps + if not self.radius: #: single address search, non comps property_id = location_info["mpr_id"] search_variables |= {"property_id": property_id} return self.handle_address(property_id) - else: #: general search, comps 
(radius) + else: #: general search, comps (radius) coordinates = list(location_info["centroid"].values()) search_variables |= { "coordinates": coordinates, @@ -370,10 +386,10 @@ class RealtorScraper(Scraper): ) return Address( street=f"{result['address']['street_number']} {result['address']['street_name']} {result['address']['street_suffix']}", - unit=result['address']['unit'], - city=result['address']['city'], - state=result['address']['state_code'], - zip=result['address']['postal_code'], + unit=result["address"]["unit"], + city=result["address"]["city"], + state=result["address"]["state_code"], + zip=result["address"]["postal_code"], ) @staticmethod @@ -390,4 +406,4 @@ class RealtorScraper(Scraper): year_built=description_data.get("year_built"), garage=description_data.get("garage"), stories=description_data.get("stories"), - ) \ No newline at end of file + ) diff --git a/homeharvest/utils.py b/homeharvest/utils.py index 58fbce0..1f7f717 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,4 +1,4 @@ -from .core.scrapers.models import Property +from .core.scrapers.models import Property, ListingType import pandas as pd ordered_properties = [ @@ -73,4 +73,11 @@ def process_result(result: Property) -> pd.DataFrame: properties_df = pd.DataFrame([prop_data]) properties_df = properties_df.reindex(columns=ordered_properties) - return properties_df[ordered_properties] \ No newline at end of file + return properties_df[ordered_properties] + + +def validate_input(listing_type: str) -> None: + if listing_type.upper() not in ListingType.__members__: + raise InvalidListingType( + f"Provided listing type, '{listing_type}', does not exist." 
+ ) diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 1c06848..15b7e09 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -9,10 +9,10 @@ from homeharvest.exceptions import ( def test_realtor_comps(): result = scrape_property( - location="2530 Al Lipscomb Way", - radius=0.5, - sold_last_x_days=180, - listing_type="sold", + location="2530 Al Lipscomb Way", + radius=0.5, + sold_last_x_days=180, + listing_type="sold", ) assert result is not None and len(result) > 0 @@ -27,7 +27,9 @@ def test_realtor_last_x_days_sold(): location="Dallas, TX", listing_type="sold", sold_last_x_days=10 ) - assert all([result is not None for result in [days_result_30, days_result_10]]) and len(days_result_30) != len(days_result_10) + assert all( + [result is not None for result in [days_result_30, days_result_10]] + ) and len(days_result_30) != len(days_result_10) def test_realtor_single_property(): @@ -39,7 +41,7 @@ def test_realtor_single_property(): scrape_property( location="2530 Al Lipscomb Way", listing_type="for_sale", - ) + ), ] assert all([result is not None for result in results])