feat: keep duplicates flag

pull/14/head
Cullen Watson 2023-09-20 20:24:18 -05:00
parent e9ddc6df92
commit 644f16b25b
2 changed files with 11 additions and 2 deletions

View File

@@ -119,6 +119,7 @@ def scrape_property(
     site_name: Union[str, list[str]] = None,
     listing_type: str = "for_sale",
     proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
         if col not in final_df.columns:
             final_df[col] = None
-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
     return final_df

View File

@@ -42,11 +42,18 @@ def main():
         help="Name of the output file (without extension)",
     )
+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
     parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
     args = parser.parse_args()
-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
     if not args.filename:
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
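The store_true action means --keep_duplicates defaults to False and flips to True only when the flag is passed, which is what lets scrape_property keep its dedup-by-default behavior. A minimal sketch of just that argparse behavior, independent of the rest of the CLI:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-k",
    "--keep_duplicates",
    action="store_true",
    help="Keep duplicate properties based on address",
)

# Flag omitted: defaults to False. Flag passed: becomes True.
assert parser.parse_args([]).keep_duplicates is False
assert parser.parse_args(["-k"]).keep_duplicates is True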