Merge pull request #14 from ZacharyHampton/keep_duplicates_flag

Keep duplicates flag
2026-03-04 19:44:29 -08:00 · 2023-09-20 20:27:08 -07:00
parent e9ddc6df92 e82eeaa59f
commit 4cef926d7d
4 changed files with 14 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ By default:
 - The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
 - If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
 - If `-p` or `--proxy` is not provided, the scraper uses the local IP.
+- Use `-k` or `--keep_duplicates` to keep duplicate properties (listings that share the same address). If the flag is omitted, duplicates are removed and only the first occurrence of each property is kept.
 ### Python 

 ```py
@@ -73,6 +74,7 @@ Required
 Optional
 ├── site_name (list[enum], default=all three sites): zillow, realtor.com, redfin
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
+└── keep_duplicates (bool, default=False): if True, keep duplicate properties (same address); if False, drop duplicates and keep only the first occurrence
 ```

 ### Property Schema
--- a/homeharvest/init.py
+++ b/homeharvest/init.py
@@ -119,6 +119,7 @@ def scrape_property(
    site_name: Union[str, list[str]] = None,
    listing_type: str = "for_sale",
    proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
    """
    Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
        if col not in final_df.columns:
            final_df[col] = None

-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
    return final_df
--- a/homeharvest/cli.py
+++ b/homeharvest/cli.py
@@ -42,11 +42,18 @@ def main():
        help="Name of the output file (without extension)",
    )

+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
+
    parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")

    args = parser.parse_args()

-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)

    if not args.filename:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.2.7"
+version = "0.2.8"
 description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"