diff --git a/README.md b/README.md index 2e6e2b4..9888de7 100644 --- a/README.md +++ b/README.md @@ -26,18 +26,19 @@ pip install --force-reinstall homeharvest ### CLI ```bash -homeharvest "San Francisco, CA" --site_name zillow realtor.com redfin --listing_type for_rent --output excel --filename HomeHarvest +homeharvest "San Francisco, CA" -s zillow realtor.com redfin -l for_rent -o excel -f HomeHarvest ``` This will scrape properties from the specified sites for the given location and listing type, and save the results to an Excel file named `HomeHarvest.xlsx`. By default: -- If `--site_name` is not provided, it will scrape from all available sites. -- If `--listing_type` is left blank, the default is `for_sale`, other options are `for_rent` or `sold`. -- The `--output` default format is `excel`, options are `csv` or `excel`. -- If `--filename` is left blank, the default is `HomeHarvest_` - +- If `-s` or `--site_name` is not provided, it will scrape from all available sites. +- If `-l` or `--listing_type` is left blank, the default is `for_sale`. Other options are `for_rent` or `sold`. +- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`. +- If `-f` or `--filename` is left blank, the default is `HomeHarvest_`. +- If `-p` or `--proxy` is not provided, the scraper uses the local IP. ### Python + ```py from homeharvest import scrape_property import pandas as pd @@ -71,6 +72,7 @@ Required └── listing_type (enum): for_rent, for_sale, sold Optional ├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin +├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] ``` ### Property Schema diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 05cbea3..9c6d9c8 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -18,7 +18,7 @@ _scrapers = { } -def validate_input(site_name: str, listing_type: str) -> None: +def _validate_input(site_name: str, listing_type: str) -> None: if site_name.lower() not in _scrapers: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") @@ -28,7 +28,7 @@ def validate_input(site_name: str, listing_type: str) -> None: ) -def get_ordered_properties(result: Property) -> list[str]: +def _get_ordered_properties(result: Property) -> list[str]: return [ "property_url", "site_name", @@ -75,7 +75,7 @@ def get_ordered_properties(result: Property) -> list[str]: ] -def process_result(result: Property) -> pd.DataFrame: +def _process_result(result: Property) -> pd.DataFrame: prop_data = result.__dict__ prop_data["site_name"] = prop_data["site_name"].value @@ -96,29 +96,30 @@ def process_result(result: Property) -> pd.DataFrame: del prop_data["address"] properties_df = pd.DataFrame([prop_data]) - properties_df = properties_df[get_ordered_properties(result)] + properties_df = properties_df[_get_ordered_properties(result)] return properties_df def _scrape_single_site( - location: str, site_name: str, listing_type: str + location: str, site_name: str, listing_type: str, proxy: str = None ) -> pd.DataFrame: """ Helper function to scrape a single site. 
""" - validate_input(site_name, listing_type) + _validate_input(site_name, listing_type) scraper_input = ScraperInput( location=location, listing_type=ListingType[listing_type.upper()], site_name=SiteName.get_by_value(site_name.lower()), + proxy=proxy, ) site = _scrapers[site_name.lower()](scraper_input) results = site.search() - properties_dfs = [process_result(result) for result in results] + properties_dfs = [_process_result(result) for result in results] properties_dfs = [ df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty ] @@ -132,6 +133,7 @@ def scrape_property( location: str, site_name: Union[str, list[str]] = None, listing_type: str = "for_sale", + proxy: str = None, ) -> pd.DataFrame: """ Scrape property from various sites from a given location and listing type. @@ -151,13 +153,13 @@ def scrape_property( results = [] if len(site_name) == 1: - final_df = _scrape_single_site(location, site_name[0], listing_type) + final_df = _scrape_single_site(location, site_name[0], listing_type, proxy) results.append(final_df) else: with ThreadPoolExecutor() as executor: futures = { executor.submit( - _scrape_single_site, location, s_name, listing_type + _scrape_single_site, location, s_name, listing_type, proxy ): s_name for s_name in site_name } diff --git a/homeharvest/cli.py b/homeharvest/cli.py index a056dd3..df237cf 100644 --- a/homeharvest/cli.py +++ b/homeharvest/cli.py @@ -8,36 +8,51 @@ def main(): parser.add_argument( "location", type=str, help="Location to scrape (e.g., San Francisco, CA)" ) + parser.add_argument( + "-s", "--site_name", type=str, nargs="*", default=None, - help="Site name(s) to scrape from (e.g., realtor.com zillow)", + help="Site name(s) to scrape from (e.g., realtor, zillow)", ) + parser.add_argument( + "-l", "--listing_type", type=str, default="for_sale", choices=["for_sale", "for_rent", "sold"], help="Listing type to scrape", ) + parser.add_argument( + "-o", "--output", type=str, default="excel", choices=["excel", "csv"], help="Output format", ) + parser.add_argument( + "-f", "--filename", type=str, default=None, help="Name of the output file (without extension)", ) + parser.add_argument( + "-p", "--proxy", type=str, default=None, help="Proxy to use for scraping" + ) + args = parser.parse_args() - result = scrape_property(args.location, args.site_name, args.listing_type) + + result = scrape_property( + args.location, args.site_name, args.listing_type, proxy=args.proxy + ) if not args.filename: timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 3e2c25a..5e2da68 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -8,7 +8,7 @@ class ScraperInput: location: str listing_type: ListingType site_name: SiteName - proxy_url: str | None = None + proxy: str | None = None class Scraper: @@ -20,11 +20,9 @@ class Scraper: self.listing_type = scraper_input.listing_type self.site_name = scraper_input.site_name - if scraper_input.proxy_url: - self.session.proxies = { - "http": scraper_input.proxy_url, - "https": scraper_input.proxy_url, - } + self.proxy = (lambda p: {"http": p, "https": p} if p else None)( + scraper_input.proxy + ) def search(self) -> list[Property]: ... 
diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index ccddf71..f8a63bf 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -39,6 +39,7 @@ class RealtorScraper(Scraper): "https://parser-external.geo.moveaws.com/suggest", params=params, headers=headers, + proxies=self.proxy, ) response_json = response.json() @@ -104,7 +105,7 @@ class RealtorScraper(Scraper): "variables": variables, } - response = self.session.post(self.search_url, json=payload) + response = self.session.post(self.search_url, json=payload, proxies=self.proxy) response_json = response.json() property_info = response_json["data"]["property"] @@ -217,7 +218,7 @@ class RealtorScraper(Scraper): "variables": variables, } - response = self.session.post(self.search_url, json=payload) + response = self.session.post(self.search_url, json=payload, proxies=self.proxy) response.raise_for_status() response_json = response.json() diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index e2fb0d8..57ec57d 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -16,7 +16,7 @@ class RedfinScraper(Scraper): self.location ) - response = self.session.get(url) + response = self.session.get(url, proxies=self.proxy) response_json = json.loads(response.text.replace("{}&&", "")) def get_region_type(match_type: str): @@ -111,7 +111,7 @@ class RedfinScraper(Scraper): def _handle_rentals(self, region_id, region_type): url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true®ion_id={region_id}®ion_type={region_type}&num_homes=100000" - response = self.session.get(url) + response = self.session.get(url, proxies=self.proxy) response.raise_for_status() homes = response.json() @@ -211,7 +211,7 @@ class RedfinScraper(Scraper): home_id ) - response = self.session.get(url) + response = self.session.get(url, proxies=self.proxy) response_json = json.loads(response.text.replace("{}&&", "")) parsed_home = self._parse_home( @@ -233,7 +233,7 @@ class RedfinScraper(Scraper): url = f"https://www.redfin.com/stingray/api/gis?al=1®ion_id={region_id}®ion_type={region_type}&num_homes=100000" else: url = f"https://www.redfin.com/stingray/api/gis?al=1®ion_id={region_id}®ion_type={region_type}&sold_within_days=30&num_homes=100000" - response = self.session.get(url) + response = self.session.get(url, proxies=self.proxy) response_json = json.loads(response.text.replace("{}&&", "")) homes = [ self._parse_home(home) for home in response_json["payload"]["homes"] diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 43571f6..8a9e135 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -1,6 +1,5 @@ import re import json -import string from .. 
import Scraper from ....utils import parse_address_two, parse_unit from ....exceptions import GeoCoordsNotFound, NoResultsFound @@ -27,12 +26,14 @@ class ZillowScraper(Scraper): "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render" ).format(location) - response = self.session.get(url) + response = self.session.get(url, proxies=self.proxy) return response.json()["results"] != [] def search(self): - resp = self.session.get(self.url, headers=self._get_headers()) + resp = self.session.get( + self.url, headers=self._get_headers(), proxies=self.proxy + ) resp.raise_for_status() content = resp.text @@ -129,7 +130,9 @@ class ZillowScraper(Scraper): "wants": {"cat1": ["mapResults"]}, "isDebugRequest": False, } - resp = self.session.put(url, headers=self._get_headers(), json=payload) + resp = self.session.put( + url, headers=self._get_headers(), json=payload, proxies=self.proxy + ) resp.raise_for_status() a = resp.json() return self._parse_properties(resp.json())
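Rather than mutating `session.proxies` at construction time, the base `Scraper` now stores a proxies mapping and every request passes it explicitly via `proxies=self.proxy`, which keeps the shared session state untouched. A standalone sketch of that normalization, assuming a hypothetical helper name `_build_proxies` that is not part of the library:

```py
import requests

def _build_proxies(proxy: str | None) -> dict[str, str] | None:
    # One proxy URL is reused for both HTTP and HTTPS traffic,
    # mirroring {"http": p, "https": p} in the Scraper base class.
    return {"http": proxy, "https": proxy} if proxy else None

session = requests.Session()
# requests treats proxies=None as "no proxy", so unproxied calls need no branching.
resp = session.get("https://example.com", proxies=_build_proxies(None))
```

Because `requests` accepts `proxies=None`, each `session.get`/`session.post`/`session.put` call in the scrapers can take `proxies=self.proxy` unconditionally, whether or not a proxy was supplied.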