diff --git a/README.md b/README.md
index e92d929..37f4515 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ By default:
 - The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
 - If `-f` or `--filename` is left blank, the default is `HomeHarvest_`.
 - If `-p` or `--proxy` is not provided, the scraper uses the local IP.
+- Use `-k` or `--keep_duplicates` to keep duplicate properties based on address. If not provided, duplicates will be removed.
 
 ### Python
 ```py
@@ -73,6 +74,7 @@ Required
 Optional
 ├── site_name (list[enum], default=all three sites): zillow, realtor.com, redfin
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
+└── keep_duplicates (bool, default=False): whether to keep or remove duplicate properties based on address
 ```
 
 ### Property Schema
diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index f9169a6..0373e4b 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -119,6 +119,7 @@ def scrape_property(
     site_name: Union[str, list[str]] = None,
     listing_type: str = "for_sale",
     proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
         if col not in final_df.columns:
             final_df[col] = None
 
-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
     return final_df
diff --git a/homeharvest/cli.py b/homeharvest/cli.py
index 099873c..c9deae8 100644
--- a/homeharvest/cli.py
+++ b/homeharvest/cli.py
@@ -42,11 +42,18 @@ def main():
         help="Name of the output file (without extension)",
     )
 
+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
+
     parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
 
     args = parser.parse_args()
 
-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
 
     if not args.filename:
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
diff --git a/pyproject.toml b/pyproject.toml
index c439cca..9d014b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.2.7"
+version = "0.2.8"
 description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"
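
For reviewers, a minimal usage sketch of the new option. It follows the `scrape_property` signature shown in the `__init__.py` hunk above; the `location=` keyword and the example city are assumptions for illustration, not taken from this diff.

```py
from homeharvest import scrape_property

# Default behavior (keep_duplicates=False): rows the library treats as
# duplicates (matched on address, per the README note above) are dropped
# via DataFrame.drop_duplicates(..., keep="first").
deduped = scrape_property(
    location="San Francisco, CA",  # hypothetical example location
    listing_type="for_sale",
)

# With the new flag, every scraped row is returned, even when the same
# address comes back from more than one site.
with_dupes = scrape_property(
    location="San Francisco, CA",
    listing_type="for_sale",
    keep_duplicates=True,
)

print(len(deduped), len(with_dupes))
```

The CLI equivalent is the new `-k`/`--keep_duplicates` switch added in `cli.py`, which is forwarded to `scrape_property` as `keep_duplicates=args.keep_duplicates`.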