From 644f16b25bad58764eea0e978e5e732c11b3cf7d Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Wed, 20 Sep 2023 20:24:18 -0500
Subject: [PATCH] feat: keep duplicates flag

---
 homeharvest/__init__.py | 4 +++-
 homeharvest/cli.py      | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
index f9169a6..0373e4b 100644
--- a/homeharvest/__init__.py
+++ b/homeharvest/__init__.py
@@ -119,6 +119,7 @@ def scrape_property(
     site_name: Union[str, list[str]] = None,
     listing_type: str = "for_sale",
     proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
         if col not in final_df.columns:
             final_df[col] = None
 
-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
 
     return final_df
diff --git a/homeharvest/cli.py b/homeharvest/cli.py
index 099873c..c9deae8 100644
--- a/homeharvest/cli.py
+++ b/homeharvest/cli.py
@@ -42,11 +42,18 @@ def main():
         help="Name of the output file (without extension)",
     )
 
+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
+
     parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
 
     args = parser.parse_args()
 
-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
 
     if not args.filename:
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")