commit 4cef926d7d
@@ -37,6 +37,7 @@ By default:
 - The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
 - If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
 - If `-p` or `--proxy` is not provided, the scraper uses the local IP.
+- Use `-k` or `--keep_duplicates` to keep duplicate properties based on address. If not provided, duplicates will be removed.
 ### Python
 
 ```py
@@ -73,6 +74,7 @@ Required
 Optional
 ├── site_name (list[enum], default=all three sites): zillow, realtor.com, redfin
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
+└── keep_duplicates (bool, default=False): whether to keep or remove duplicate properties based on address
 ```
 
 ### Property Schema
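For reference, a minimal usage sketch of the `keep_duplicates` parameter documented above; the location, site list, and printed output are illustrative assumptions, not part of this commit:

```py
from homeharvest import scrape_property

# Illustrative values; the location and site list are not from this commit.
properties = scrape_property(
    location="Dallas, TX",
    site_name=["zillow", "redfin"],
    listing_type="for_sale",
    keep_duplicates=True,  # keep listings that share an address instead of dropping them
)

# scrape_property returns a pandas DataFrame.
print(properties.shape)
```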
@@ -119,6 +119,7 @@ def scrape_property(
     site_name: Union[str, list[str]] = None,
     listing_type: str = "for_sale",
     proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
         if col not in final_df.columns:
             final_df[col] = None
 
-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
     return final_df
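To illustrate the guarded dedup above in isolation: `drop_duplicates(subset=..., keep="first")` retains one row per combination of the tracked columns. The column names and rows below are made up, and `columns_to_track` here only stands in for the library's own list:

```py
import pandas as pd

# Stand-in for the library's columns_to_track; the real column names may differ.
columns_to_track = ["street_address", "city", "state"]

final_df = pd.DataFrame(
    [
        {"street_address": "1 Main St", "city": "Dallas", "state": "TX", "price": 300000},
        {"street_address": "1 Main St", "city": "Dallas", "state": "TX", "price": 305000},
    ]
)

keep_duplicates = False
if not keep_duplicates:
    # keep="first" retains the first listing for each address and drops the rest.
    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")

print(len(final_df))  # 1 when deduplicating, 2 when keep_duplicates=True
```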
@@ -42,11 +42,18 @@ def main():
         help="Name of the output file (without extension)",
     )
 
+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
+
     parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
 
     args = parser.parse_args()
 
-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
 
     if not args.filename:
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
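A small sketch of how the `store_true` flag added above parses: the argument definition mirrors the diff, while the standalone parser and the prints are illustrative.

```py
import argparse

# Minimal standalone parser for illustration; the real CLI defines more arguments.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-k",
    "--keep_duplicates",
    action="store_true",
    help="Keep duplicate properties based on address",
)

# Passing -k sets the flag; omitting it leaves the store_true default of False.
print(parser.parse_args(["-k"]).keep_duplicates)  # True
print(parser.parse_args([]).keep_duplicates)      # False
```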
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.2.7"
+version = "0.2.8"
 description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"