diff --git a/README.md b/README.md index 096643b..5feaa5f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings. +**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings. **Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com). @@ -13,8 +13,6 @@ - **Export Flexibility**: Options to save as either CSV or Excel. - **Usage Modes**: - **Python**: For those who'd like to integrate scraping into their Python scripts. - - **CLI**: For users who prefer command-line operations. - [Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_ @@ -46,9 +44,9 @@ properties = scrape_property( # date_from="2023-05-01", # alternative to past_days # date_to="2023-05-28", + # foreclosure=True, # mls_only=True, # only fetch MLS listings - # proxy="http://user:pass@host:port" # use a proxy to change your IP address ) print(f"Number of properties: {len(properties)}") @@ -57,7 +55,6 @@ properties.to_csv(filename, index=False) print(properties.head()) ``` - ## Output ```plaintext >>> properties.head() @@ -94,37 +91,9 @@ Optional │ ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) │ +├── foreclosure (True/False): If set, fetches only foreclosures +│ └── proxy (string): In format 'http://user:pass@host:port' - - -``` - -### CLI - -``` -usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location - -Home Harvest Property Scraper - -positional arguments: - location Location to scrape (e.g., San Francisco, CA) - -options: - -l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending} - Listing type to scrape - -o {excel,csv}, --output {excel,csv} - Output format - -f FILENAME, 
--filename FILENAME - Name of the output file (without extension) - -p PROXY, --proxy PROXY - Proxy to use for scraping - -d DAYS, --days DAYS Sold/listed in last _ days filter. - -r RADIUS, --radius RADIUS - Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses. - -m, --mls_only If set, fetches only MLS listings. -``` -```bash -homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest ``` ### Property Schema @@ -175,21 +144,4 @@ The following exceptions may be raised when using HomeHarvest: - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD - - -## Frequently Asked Questions ---- - -**Q: Encountering issues with your searches?** -**A:** Try to broaden the parameters you're using. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues). - ---- - -**Q: Received a Forbidden 403 response code?** -**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend: - -- Waiting a few seconds between requests. -- Trying a VPN or using a proxy as a parameter to scrape_property() to change your IP address. - ---- - + \ No newline at end of file diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 44107eb..a0973ba 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -15,6 +15,7 @@ def scrape_property( proxy: str = None, date_from: str = None, date_to: str = None, + foreclosure: bool = None, ) -> pd.DataFrame: """ Scrape properties from Realtor.com based on a given location and listing type. 
@@ -38,6 +39,7 @@ def scrape_property( last_x_days=past_days, date_from=date_from, date_to=date_to, + foreclosure=foreclosure, ) site = RealtorScraper(scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 29dc08f..a0bd471 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -13,6 +13,7 @@ class ScraperInput: last_x_days: int | None = None date_from: str | None = None date_to: str | None = None + foreclosure: bool | None = None class Scraper: @@ -40,6 +41,7 @@ class Scraper: self.mls_only = scraper_input.mls_only self.date_from = scraper_input.date_from self.date_to = scraper_input.date_to + self.foreclosure = scraper_input.foreclosure def search(self) -> list[Property]: ... diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 090a12f..530828e 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -381,9 +381,16 @@ class RealtorScraper(Scraper): if self.listing_type == ListingType.PENDING else "" ) - + listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type - + is_foreclosure = "" + + if variables.get('foreclosure') is True: + is_foreclosure = "foreclosure: true" + + if variables.get('foreclosure') is False: + is_foreclosure = "foreclosure: false" + if search_type == "comps": #: comps search, came from an address query = """query Property_search( $coordinates: [Float]! 
@@ -392,6 +399,7 @@ class RealtorScraper(Scraper): ) { home_search( query: { + %s nearby: { coordinates: $coordinates radius: $radius @@ -404,6 +412,7 @@ class RealtorScraper(Scraper): limit: 200 offset: $offset ) %s""" % ( + is_foreclosure, listing_type.value.lower(), date_param, pending_or_contingent_param, @@ -420,6 +429,7 @@ class RealtorScraper(Scraper): ) { home_search( query: { + %s city: $city county: $county postal_code: $postal_code @@ -432,6 +442,7 @@ class RealtorScraper(Scraper): limit: 200 offset: $offset ) %s""" % ( + is_foreclosure, listing_type.value.lower(), date_param, pending_or_contingent_param, @@ -541,7 +552,7 @@ class RealtorScraper(Scraper): search_variables = { "offset": 0, } - + search_type = ( "comps" if self.radius and location_type == "address" @@ -586,6 +597,9 @@ class RealtorScraper(Scraper): "postal_code": location_info.get("postal_code"), } + if self.foreclosure is not None: + search_variables['foreclosure'] = self.foreclosure + result = self.general_search(search_variables, search_type=search_type) total = result["total"] homes = result["properties"] diff --git a/pyproject.toml b/pyproject.toml index c999a8e..71ec289 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "homeharvest" version = "0.3.12" -description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." 
-authors = ["Zachary Hampton ", "Cullen Watson "] +description = "Real estate scraping library" +authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest" readme = "README.md" @@ -13,7 +13,6 @@ homeharvest = "homeharvest.cli:main" python = ">=3.10,<3.13" requests = "^2.31.0" pandas = "^2.1.1" -openpyxl = "^3.1.2" [tool.poetry.group.dev.dependencies] diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 018b1b3..43a21da 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -139,3 +139,15 @@ def test_realtor_bad_address(): if len(bad_results) == 0: assert True + +def test_realtor_foreclosed(): + foreclosed = scrape_property( + location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=True + ) + + not_foreclosed = scrape_property( + location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=False + ) + + assert len(foreclosed) != len(not_foreclosed) +