mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 03:54:29 -08:00
Compare commits
57 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a39357a07 | ||
|
|
8f06d46ddb | ||
|
|
0dae14ccfc | ||
|
|
9aaabdd5d8 | ||
|
|
cdf41fe9f2 | ||
|
|
1f0feb836d | ||
|
|
5f31beda46 | ||
|
|
fd9cdea499 | ||
|
|
93a1cbe17f | ||
|
|
49d27943c4 | ||
|
|
05fca9b7e6 | ||
|
|
20ce44fb3a | ||
|
|
52017c1bb5 | ||
|
|
dba1c03081 | ||
|
|
1fc2d8c549 | ||
|
|
02d112eea0 | ||
|
|
30e510882b | ||
|
|
78b56c2cac | ||
|
|
087854a688 | ||
|
|
80586467a8 | ||
|
|
3494b152b8 | ||
|
|
6c6fef80ed | ||
|
|
62e3321277 | ||
|
|
80186ee8c5 | ||
|
|
3ec47c5b6a | ||
|
|
42e8ac4de9 | ||
|
|
e1917009ae | ||
|
|
7297f0eb33 | ||
|
|
2eec389838 | ||
|
|
b01162161d | ||
|
|
906ce92685 | ||
|
|
cc76e067b2 | ||
|
|
1f0c351974 | ||
|
|
a1684f87db | ||
|
|
2ae3ebe28e | ||
|
|
ae3961514b | ||
|
|
0621b01d9a | ||
|
|
fbbd56d930 | ||
|
|
82092faa28 | ||
|
|
8f90a80b0a | ||
|
|
d5b4d80f96 | ||
|
|
086bcfd224 | ||
|
|
4726764482 | ||
|
|
ca260fd2b4 | ||
|
|
94e5b090da | ||
|
|
d0a6a66b6a | ||
|
|
8e140a0e45 | ||
|
|
588689c230 | ||
|
|
c7a4bfd5e4 | ||
|
|
fe351ab57c | ||
|
|
5d0f519a85 | ||
|
|
869d7e7c51 | ||
|
|
ffd3ce6aed | ||
|
|
471e53118e | ||
|
|
dc8c15959f | ||
|
|
10c01f373e | ||
|
|
fd01bfb8b8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@
|
|||||||
**/.pytest_cache/
|
**/.pytest_cache/
|
||||||
*.pyc
|
*.pyc
|
||||||
/.ipynb_checkpoints/
|
/.ipynb_checkpoints/
|
||||||
|
*.csv
|
||||||
@@ -31,8 +31,30 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# scrapes all 3 sites by default\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
|
" location=\"dallas\",\n",
|
||||||
|
" listing_type=\"for_sale\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "aaf86093",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# search a specific address\n",
|
||||||
|
"scrape_property(\n",
|
||||||
|
" location=\"2530 Al Lipscomb Way\",\n",
|
||||||
|
" site_name=\"zillow\",\n",
|
||||||
|
" listing_type=\"for_sale\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -43,8 +65,31 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# check rentals\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
|
" location=\"chicago, illinois\",\n",
|
||||||
|
" site_name=[\"redfin\", \"zillow\"],\n",
|
||||||
|
" listing_type=\"for_rent\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "af280cd3",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# check sold properties\n",
|
||||||
|
"scrape_property(\n",
|
||||||
|
" location=\"90210\",\n",
|
||||||
|
" site_name=[\"redfin\"],\n",
|
||||||
|
" listing_type=\"sold\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
164
README.md
164
README.md
@@ -1,33 +1,165 @@
|
|||||||
# HomeHarvest
|
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
||||||
|
|
||||||
**HomeHarvest** aims to be the top Python real estate scraping library.
|
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
|
||||||
|
|
||||||
_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._
|
[](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||||
|
|
||||||
|
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
||||||
|
- Aggregates the properties in a Pandas DataFrame
|
||||||
|
|
||||||
|
[Video Guide for HomeHarvest](https://www.youtube.com/watch?v=HCoHoiJdWQY)
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install --upgrade homeharvest
|
pip install --force-reinstall homeharvest
|
||||||
|
```
|
||||||
|
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### CLI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
homeharvest "San Francisco, CA" -s zillow realtor.com redfin -l for_rent -o excel -f HomeHarvest
|
||||||
```
|
```
|
||||||
|
|
||||||
## Example Usage
|
This will scrape properties from the specified sites for the given location and listing type, and save the results to an Excel file named `HomeHarvest.xlsx`.
|
||||||
```
|
|
||||||
|
By default:
|
||||||
|
- If `-s` or `--site_name` is not provided, it will scrape from all available sites.
|
||||||
|
- If `-l` or `--listing_type` is left blank, the default is `for_sale`. Other options are `for_rent` or `sold`.
|
||||||
|
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
|
||||||
|
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
|
||||||
|
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
|
||||||
|
### Python
|
||||||
|
|
||||||
|
```py
|
||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
properties = scrape_property(
|
properties: pd.DataFrame = scrape_property(
|
||||||
location="85281", site_name="zillow", listing_type="for_rent"
|
site_name=["zillow", "realtor.com", "redfin"],
|
||||||
|
location="85281",
|
||||||
|
listing_type="for_rent" # for_sale / sold
|
||||||
)
|
)
|
||||||
|
|
||||||
|
#: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel().
|
||||||
print(properties)
|
print(properties)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Site Name Options
|
## Output
|
||||||
|
```py
|
||||||
|
>>> properties.head()
|
||||||
|
property_url site_name listing_type apt_min_price apt_max_price ...
|
||||||
|
0 https://www.redfin.com/AZ/Tempe/1003-W-Washing... redfin for_rent 1666.0 2750.0 ...
|
||||||
|
1 https://www.redfin.com/AZ/Tempe/VELA-at-Town-L... redfin for_rent 1665.0 3763.0 ...
|
||||||
|
2 https://www.redfin.com/AZ/Tempe/Camden-Tempe/a... redfin for_rent 1939.0 3109.0 ...
|
||||||
|
3 https://www.redfin.com/AZ/Tempe/Emerson-Park/a... redfin for_rent 1185.0 1817.0 ...
|
||||||
|
4 https://www.redfin.com/AZ/Tempe/Rio-Paradiso-A... redfin for_rent 1470.0 2235.0 ...
|
||||||
|
[5 rows x 41 columns]
|
||||||
|
```
|
||||||
|
|
||||||
- `zillow`
|
### Parameters for `scrape_property()`
|
||||||
- `redfin`
|
```plaintext
|
||||||
- `realtor.com`
|
Required
|
||||||
|
├── location (str): address in various formats e.g. just zip, full address, city/state, etc.
|
||||||
|
└── listing_type (enum): for_rent, for_sale, sold
|
||||||
|
Optional
|
||||||
|
├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin
|
||||||
|
└── proxy (str): proxy URL in the format 'http://user:pass@host:port' (https and socks schemes are also accepted)
|
||||||
|
```
|
||||||
|
|
||||||
### Listing Types
|
### Property Schema
|
||||||
|
```plaintext
|
||||||
|
Property
|
||||||
|
├── Basic Information:
|
||||||
|
│ ├── property_url (str)
|
||||||
|
│ ├── site_name (enum): zillow, redfin, realtor.com
|
||||||
|
│ ├── listing_type (enum: ListingType)
|
||||||
|
│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building
|
||||||
|
|
||||||
|
├── Address Details:
|
||||||
|
│ ├── street_address (str)
|
||||||
|
│ ├── city (str)
|
||||||
|
│ ├── state (str)
|
||||||
|
│ ├── zip_code (str)
|
||||||
|
│ ├── unit (str)
|
||||||
|
│ └── country (str)
|
||||||
|
|
||||||
|
├── Property Features:
|
||||||
|
│ ├── price (int)
|
||||||
|
│ ├── tax_assessed_value (int)
|
||||||
|
│ ├── currency (str)
|
||||||
|
│ ├── square_feet (int)
|
||||||
|
│ ├── beds (int)
|
||||||
|
│ ├── baths (float)
|
||||||
|
│ ├── lot_area_value (float)
|
||||||
|
│ ├── lot_area_unit (str)
|
||||||
|
│ ├── stories (int)
|
||||||
|
│ └── year_built (int)
|
||||||
|
|
||||||
|
├── Miscellaneous Details:
|
||||||
|
│ ├── price_per_sqft (int)
|
||||||
|
│ ├── mls_id (str)
|
||||||
|
│ ├── agent_name (str)
|
||||||
|
│ ├── img_src (str)
|
||||||
|
│ ├── description (str)
|
||||||
|
│ ├── status_text (str)
|
||||||
|
│ ├── latitude (float)
|
||||||
|
│ ├── longitude (float)
|
||||||
|
│ └── posted_time (str) [Only for Zillow]
|
||||||
|
|
||||||
|
├── Building Details (for property_type: building):
|
||||||
|
│ ├── bldg_name (str)
|
||||||
|
│ ├── bldg_unit_count (int)
|
||||||
|
│ ├── bldg_min_beds (int)
|
||||||
|
│ ├── bldg_min_baths (float)
|
||||||
|
│ └── bldg_min_area (int)
|
||||||
|
|
||||||
|
└── Apartment Details (for property_type: apartment):
|
||||||
|
├── apt_min_beds: int
|
||||||
|
├── apt_max_beds: int
|
||||||
|
├── apt_min_baths: float
|
||||||
|
├── apt_max_baths: float
|
||||||
|
├── apt_min_price: int
|
||||||
|
├── apt_max_price: int
|
||||||
|
├── apt_min_sqft: int
|
||||||
|
├── apt_max_sqft: int
|
||||||
|
```
|
||||||
|
## Supported Countries for Property Scraping
|
||||||
|
|
||||||
|
* **Zillow**: contains listings in the **US** & **Canada**
|
||||||
|
* **Realtor.com**: contains listings mainly from the **US**, but also has international listings
|
||||||
|
* **Redfin**: contains listings mainly in the **US** and **Canada**, and has expanded to some areas in **Mexico**
|
||||||
|
|
||||||
|
### Exceptions
|
||||||
|
The following exceptions may be raised when using HomeHarvest:
|
||||||
|
|
||||||
|
- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com`
|
||||||
|
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
|
||||||
|
- `NoResultsFound` - no properties found from your input
|
||||||
|
- `GeoCoordsNotFound` - the Zillow scraper could not derive geo-coordinates from the location you provided
|
||||||
|
|
||||||
|
## Frequently Asked Questions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Q: Encountering issues with your queries?**
|
||||||
|
**A:** Try a single site and/or broaden the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Q: Received a Forbidden 403 response code?**
|
||||||
|
**A:** This indicates that you have been blocked by the real estate site for sending too many requests. Currently, **Zillow** is particularly aggressive with blocking. We recommend:
|
||||||
|
|
||||||
|
- Waiting a few seconds between requests.
|
||||||
|
- Trying a VPN to change your IP address.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
- `for_rent`
|
|
||||||
- `for_sale`
|
|
||||||
- `sold`
|
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from typing import Union
|
||||||
|
import concurrent.futures
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from .core.scrapers import ScraperInput
|
||||||
from .core.scrapers.redfin import RedfinScraper
|
from .core.scrapers.redfin import RedfinScraper
|
||||||
from .core.scrapers.realtor import RealtorScraper
|
from .core.scrapers.realtor import RealtorScraper
|
||||||
from .core.scrapers.zillow import ZillowScraper
|
from .core.scrapers.zillow import ZillowScraper
|
||||||
from .core.scrapers.models import ListingType, Property, Building, SiteName
|
from .core.scrapers.models import ListingType, Property, SiteName
|
||||||
from .core.scrapers import ScraperInput
|
|
||||||
from .exceptions import InvalidSite, InvalidListingType
|
from .exceptions import InvalidSite, InvalidListingType
|
||||||
from typing import Union
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
_scrapers = {
|
_scrapers = {
|
||||||
@@ -15,7 +18,7 @@ _scrapers = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def validate_input(site_name: str, listing_type: str) -> None:
|
def _validate_input(site_name: str, listing_type: str) -> None:
|
||||||
if site_name.lower() not in _scrapers:
|
if site_name.lower() not in _scrapers:
|
||||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||||
|
|
||||||
@@ -25,93 +28,161 @@ def validate_input(site_name: str, listing_type: str) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_ordered_properties(result: Union[Building, Property]) -> list[str]:
|
def _get_ordered_properties(result: Property) -> list[str]:
|
||||||
if isinstance(result, Property):
|
|
||||||
return [
|
return [
|
||||||
|
"property_url",
|
||||||
|
"site_name",
|
||||||
"listing_type",
|
"listing_type",
|
||||||
"address_one",
|
"property_type",
|
||||||
|
"status_text",
|
||||||
|
"currency",
|
||||||
|
"price",
|
||||||
|
"apt_min_price",
|
||||||
|
"apt_max_price",
|
||||||
|
"apt_min_sqft",
|
||||||
|
"apt_max_sqft",
|
||||||
|
"apt_min_beds",
|
||||||
|
"apt_max_beds",
|
||||||
|
"apt_min_baths",
|
||||||
|
"apt_max_baths",
|
||||||
|
"tax_assessed_value",
|
||||||
|
"square_feet",
|
||||||
|
"price_per_sqft",
|
||||||
|
"beds",
|
||||||
|
"baths",
|
||||||
|
"lot_area_value",
|
||||||
|
"lot_area_unit",
|
||||||
|
"street_address",
|
||||||
|
"unit",
|
||||||
"city",
|
"city",
|
||||||
"state",
|
"state",
|
||||||
"zip_code",
|
"zip_code",
|
||||||
"address_two",
|
"country",
|
||||||
"url",
|
"posted_time",
|
||||||
"property_type",
|
"bldg_min_beds",
|
||||||
"price",
|
"bldg_min_baths",
|
||||||
"beds",
|
"bldg_min_area",
|
||||||
"baths",
|
"bldg_unit_count",
|
||||||
"square_feet",
|
"bldg_name",
|
||||||
"price_per_square_foot",
|
|
||||||
"lot_size",
|
|
||||||
"stories",
|
"stories",
|
||||||
"year_built",
|
"year_built",
|
||||||
"agent_name",
|
"agent_name",
|
||||||
"mls_id",
|
"mls_id",
|
||||||
|
"img_src",
|
||||||
|
"latitude",
|
||||||
|
"longitude",
|
||||||
"description",
|
"description",
|
||||||
]
|
]
|
||||||
elif isinstance(result, Building):
|
|
||||||
return [
|
|
||||||
"address_one",
|
|
||||||
"city",
|
|
||||||
"state",
|
|
||||||
"zip_code",
|
|
||||||
"address_two",
|
|
||||||
"url",
|
|
||||||
"num_units",
|
|
||||||
"min_unit_price",
|
|
||||||
"max_unit_price",
|
|
||||||
"avg_unit_price",
|
|
||||||
"listing_type",
|
|
||||||
]
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def process_result(result: Union[Building, Property]) -> pd.DataFrame:
|
def _process_result(result: Property) -> pd.DataFrame:
|
||||||
prop_data = result.__dict__
|
prop_data = result.__dict__
|
||||||
|
|
||||||
|
prop_data["site_name"] = prop_data["site_name"].value
|
||||||
|
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
|
||||||
|
if "property_type" in prop_data and prop_data["property_type"] is not None:
|
||||||
|
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
||||||
|
else:
|
||||||
|
prop_data["property_type"] = None
|
||||||
|
if "address" in prop_data:
|
||||||
address_data = prop_data["address"]
|
address_data = prop_data["address"]
|
||||||
prop_data["site_name"] = prop_data["site_name"]
|
prop_data["street_address"] = address_data.street_address
|
||||||
prop_data["listing_type"] = prop_data["listing_type"].value
|
prop_data["unit"] = address_data.unit
|
||||||
prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data.get("property_type") else None
|
|
||||||
prop_data["address_one"] = address_data.address_one
|
|
||||||
prop_data["city"] = address_data.city
|
prop_data["city"] = address_data.city
|
||||||
prop_data["state"] = address_data.state
|
prop_data["state"] = address_data.state
|
||||||
prop_data["zip_code"] = address_data.zip_code
|
prop_data["zip_code"] = address_data.zip_code
|
||||||
prop_data["address_two"] = address_data.address_two
|
prop_data["country"] = address_data.country
|
||||||
|
|
||||||
del prop_data["address"]
|
del prop_data["address"]
|
||||||
|
|
||||||
properties_df = pd.DataFrame([prop_data])
|
properties_df = pd.DataFrame([prop_data])
|
||||||
properties_df = properties_df[get_ordered_properties(result)]
|
properties_df = properties_df[_get_ordered_properties(result)]
|
||||||
|
|
||||||
return properties_df
|
return properties_df
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_single_site(
|
||||||
|
location: str, site_name: str, listing_type: str, proxy: str = None
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Helper function to scrape a single site.
|
||||||
|
"""
|
||||||
|
_validate_input(site_name, listing_type)
|
||||||
|
|
||||||
|
scraper_input = ScraperInput(
|
||||||
|
location=location,
|
||||||
|
listing_type=ListingType[listing_type.upper()],
|
||||||
|
site_name=SiteName.get_by_value(site_name.lower()),
|
||||||
|
proxy=proxy,
|
||||||
|
)
|
||||||
|
|
||||||
|
site = _scrapers[site_name.lower()](scraper_input)
|
||||||
|
results = site.search()
|
||||||
|
|
||||||
|
properties_dfs = [_process_result(result) for result in results]
|
||||||
|
properties_dfs = [
|
||||||
|
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
|
||||||
|
]
|
||||||
|
if not properties_dfs:
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
return pd.concat(properties_dfs, ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
def scrape_property(
|
def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
site_name: str,
|
site_name: Union[str, list[str]] = None,
|
||||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
listing_type: str = "for_sale",
|
||||||
|
proxy: str = None,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Scrape property from various sites from a given location and listing type.
|
Scrape property from various sites from a given location and listing type.
|
||||||
|
|
||||||
:returns: pd.DataFrame
|
:returns: pd.DataFrame
|
||||||
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
|
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
|
||||||
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
|
:param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin')
|
||||||
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
||||||
:return: pd.DataFrame containing properties
|
:return: pd.DataFrame containing properties
|
||||||
"""
|
"""
|
||||||
|
if site_name is None:
|
||||||
|
site_name = list(_scrapers.keys())
|
||||||
|
|
||||||
validate_input(site_name, listing_type)
|
if not isinstance(site_name, list):
|
||||||
|
site_name = [site_name]
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
results = []
|
||||||
location=location,
|
|
||||||
listing_type=ListingType[listing_type.upper()],
|
if len(site_name) == 1:
|
||||||
site_name=site_name.lower(),
|
final_df = _scrape_single_site(location, site_name[0], listing_type, proxy)
|
||||||
|
results.append(final_df)
|
||||||
|
else:
|
||||||
|
with ThreadPoolExecutor() as executor:
|
||||||
|
futures = {
|
||||||
|
executor.submit(
|
||||||
|
_scrape_single_site, location, s_name, listing_type, proxy
|
||||||
|
): s_name
|
||||||
|
for s_name in site_name
|
||||||
|
}
|
||||||
|
|
||||||
|
for future in concurrent.futures.as_completed(futures):
|
||||||
|
result = future.result()
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
results = [df for df in results if not df.empty and not df.isna().all().all()]
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
final_df = pd.concat(results, ignore_index=True)
|
||||||
|
|
||||||
|
columns_to_track = ["street_address", "city", "unit"]
|
||||||
|
|
||||||
|
#: validate they exist, otherwise create them
|
||||||
|
for col in columns_to_track:
|
||||||
|
if col not in final_df.columns:
|
||||||
|
final_df[col] = None
|
||||||
|
|
||||||
|
final_df = final_df.drop_duplicates(
|
||||||
|
subset=["street_address", "city", "unit"], keep="first"
|
||||||
)
|
)
|
||||||
|
return final_df
|
||||||
site = _scrapers[site_name.lower()](scraper_input)
|
|
||||||
results = site.search()
|
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
|
||||||
|
|
||||||
return pd.concat(properties_dfs, ignore_index=True)
|
|
||||||
|
|||||||
72
homeharvest/cli.py
Normal file
72
homeharvest/cli.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import argparse
|
||||||
|
import datetime
|
||||||
|
from homeharvest import scrape_property
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
|
||||||
|
parser.add_argument(
|
||||||
|
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-s",
|
||||||
|
"--site_name",
|
||||||
|
type=str,
|
||||||
|
nargs="*",
|
||||||
|
default=None,
|
||||||
|
help="Site name(s) to scrape from (e.g., realtor, zillow)",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-l",
|
||||||
|
"--listing_type",
|
||||||
|
type=str,
|
||||||
|
default="for_sale",
|
||||||
|
choices=["for_sale", "for_rent", "sold"],
|
||||||
|
help="Listing type to scrape",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-o",
|
||||||
|
"--output",
|
||||||
|
type=str,
|
||||||
|
default="excel",
|
||||||
|
choices=["excel", "csv"],
|
||||||
|
help="Output format",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-f",
|
||||||
|
"--filename",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Name of the output file (without extension)",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
result = scrape_property(
|
||||||
|
args.location, args.site_name, args.listing_type, proxy=args.proxy
|
||||||
|
)
|
||||||
|
|
||||||
|
if not args.filename:
|
||||||
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
args.filename = f"HomeHarvest_{timestamp}"
|
||||||
|
|
||||||
|
if args.output == "excel":
|
||||||
|
output_filename = f"{args.filename}.xlsx"
|
||||||
|
result.to_excel(output_filename, index=False)
|
||||||
|
print(f"Excel file saved as {output_filename}")
|
||||||
|
elif args.output == "csv":
|
||||||
|
output_filename = f"{args.filename}.csv"
|
||||||
|
result.to_csv(output_filename, index=False)
|
||||||
|
print(f"CSV file saved as {output_filename}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -7,8 +7,8 @@ from .models import Property, ListingType, SiteName
|
|||||||
class ScraperInput:
|
class ScraperInput:
|
||||||
location: str
|
location: str
|
||||||
listing_type: ListingType
|
listing_type: ListingType
|
||||||
site_name: str
|
site_name: SiteName
|
||||||
proxy_url: str | None = None
|
proxy: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
@@ -17,15 +17,16 @@ class Scraper:
|
|||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
|
if scraper_input.proxy:
|
||||||
|
proxy_url = scraper_input.proxy
|
||||||
|
proxies = {
|
||||||
|
"http": proxy_url,
|
||||||
|
"https": proxy_url
|
||||||
|
}
|
||||||
|
self.session.proxies.update(proxies)
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|
||||||
if scraper_input.proxy_url:
|
|
||||||
self.session.proxies = {
|
|
||||||
"http": scraper_input.proxy_url,
|
|
||||||
"https": scraper_input.proxy_url,
|
|
||||||
}
|
|
||||||
|
|
||||||
def search(self) -> list[Property]:
|
def search(self) -> list[Property]:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|||||||
@@ -7,24 +7,37 @@ class SiteName(Enum):
|
|||||||
REDFIN = "redfin"
|
REDFIN = "redfin"
|
||||||
REALTOR = "realtor.com"
|
REALTOR = "realtor.com"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_by_value(cls, value):
|
||||||
|
for item in cls:
|
||||||
|
if item.value == value:
|
||||||
|
return item
|
||||||
|
raise ValueError(f"{value} not found in {cls}")
|
||||||
|
|
||||||
|
|
||||||
class ListingType(Enum):
|
class ListingType(Enum):
|
||||||
FOR_SALE = "for_sale"
|
FOR_SALE = "FOR_SALE"
|
||||||
FOR_RENT = "for_rent"
|
FOR_RENT = "FOR_RENT"
|
||||||
SOLD = "sold"
|
SOLD = "SOLD"
|
||||||
|
|
||||||
|
|
||||||
class PropertyType(Enum):
|
class PropertyType(Enum):
|
||||||
HOUSE = "HOUSE"
|
HOUSE = "HOUSE"
|
||||||
|
BUILDING = "BUILDING"
|
||||||
CONDO = "CONDO"
|
CONDO = "CONDO"
|
||||||
TOWNHOUSE = "TOWNHOUSE"
|
TOWNHOUSE = "TOWNHOUSE"
|
||||||
SINGLE_FAMILY = "SINGLE_FAMILY"
|
SINGLE_FAMILY = "SINGLE_FAMILY"
|
||||||
MULTI_FAMILY = "MULTI_FAMILY"
|
MULTI_FAMILY = "MULTI_FAMILY"
|
||||||
MANUFACTURED = "MANUFACTURED"
|
MANUFACTURED = "MANUFACTURED"
|
||||||
|
NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
|
||||||
APARTMENT = "APARTMENT"
|
APARTMENT = "APARTMENT"
|
||||||
|
APARTMENTS = "APARTMENTS"
|
||||||
LAND = "LAND"
|
LAND = "LAND"
|
||||||
|
LOT = "LOT"
|
||||||
OTHER = "OTHER"
|
OTHER = "OTHER"
|
||||||
|
|
||||||
|
BLANK = "BLANK"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_int_code(cls, code):
|
def from_int_code(cls, code):
|
||||||
mapping = {
|
mapping = {
|
||||||
@@ -38,47 +51,62 @@ class PropertyType(Enum):
|
|||||||
13: cls.SINGLE_FAMILY,
|
13: cls.SINGLE_FAMILY,
|
||||||
}
|
}
|
||||||
|
|
||||||
return mapping.get(code, cls.OTHER)
|
return mapping.get(code, cls.BLANK)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Address:
|
class Address:
|
||||||
address_one: str
|
street_address: str
|
||||||
city: str
|
city: str
|
||||||
state: str
|
state: str
|
||||||
zip_code: str
|
zip_code: str
|
||||||
|
unit: str | None = None
|
||||||
address_two: str | None = None
|
country: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass()
|
|
||||||
class Realty:
|
|
||||||
site_name: str
|
|
||||||
address: Address
|
|
||||||
url: str
|
|
||||||
listing_type: ListingType | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property(Realty):
|
class Property:
|
||||||
|
property_url: str
|
||||||
|
site_name: SiteName
|
||||||
|
listing_type: ListingType
|
||||||
|
address: Address
|
||||||
|
property_type: PropertyType | None = None
|
||||||
|
|
||||||
|
# house for sale
|
||||||
price: int | None = None
|
price: int | None = None
|
||||||
|
tax_assessed_value: int | None = None
|
||||||
|
currency: str | None = None
|
||||||
|
square_feet: int | None = None
|
||||||
beds: int | None = None
|
beds: int | None = None
|
||||||
baths: float | None = None
|
baths: float | None = None
|
||||||
|
lot_area_value: float | None = None
|
||||||
|
lot_area_unit: str | None = None
|
||||||
stories: int | None = None
|
stories: int | None = None
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
square_feet: int | None = None
|
price_per_sqft: int | None = None
|
||||||
price_per_square_foot: int | None = None
|
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent_name: str | None = None
|
||||||
property_type: PropertyType | None = None
|
img_src: str | None = None
|
||||||
lot_size: int | None = None
|
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
|
status_text: str | None = None
|
||||||
|
latitude: float | None = None
|
||||||
|
longitude: float | None = None
|
||||||
|
posted_time: str | None = None
|
||||||
|
|
||||||
|
# building for sale
|
||||||
|
bldg_name: str | None = None
|
||||||
|
bldg_unit_count: int | None = None
|
||||||
|
bldg_min_beds: int | None = None
|
||||||
|
bldg_min_baths: float | None = None
|
||||||
|
bldg_min_area: int | None = None
|
||||||
|
|
||||||
@dataclass
|
# apt
|
||||||
class Building(Realty):
|
apt_min_beds: int | None = None
|
||||||
num_units: int | None = None
|
apt_max_beds: int | None = None
|
||||||
min_unit_price: int | None = None
|
apt_min_baths: float | None = None
|
||||||
max_unit_price: int | None = None
|
apt_max_baths: float | None = None
|
||||||
avg_unit_price: int | None = None
|
apt_min_price: int | None = None
|
||||||
|
apt_max_price: int | None = None
|
||||||
|
apt_min_sqft: int | None = None
|
||||||
|
apt_max_sqft: int | None = None
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from ..models import Property, Address
|
|||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any, Generator
|
from typing import Any, Generator
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound
|
||||||
|
from ....utils import parse_address_two, parse_unit
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
@@ -29,7 +30,7 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
params = {
|
params = {
|
||||||
"input": self.location,
|
"input": self.location,
|
||||||
"client_id": self.listing_type.value.replace('_', '-'),
|
"client_id": self.listing_type.value.lower().replace("_", "-"),
|
||||||
"limit": "1",
|
"limit": "1",
|
||||||
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
||||||
}
|
}
|
||||||
@@ -43,7 +44,7 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
result = response_json["autocomplete"]
|
result = response_json["autocomplete"]
|
||||||
|
|
||||||
if result is None:
|
if not result:
|
||||||
raise NoResultsFound("No results found for location: " + self.location)
|
raise NoResultsFound("No results found for location: " + self.location)
|
||||||
|
|
||||||
return result[0]
|
return result[0]
|
||||||
@@ -96,46 +97,56 @@ class RealtorScraper(Scraper):
|
|||||||
}
|
}
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
variables = {
|
variables = {"property_id": property_id}
|
||||||
'property_id': property_id
|
|
||||||
}
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'query': query,
|
"query": query,
|
||||||
'variables': variables,
|
"variables": variables,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.session.post(self.search_url, json=payload)
|
response = self.session.post(self.search_url, json=payload)
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
property_info = response_json['data']['property']
|
property_info = response_json["data"]["property"]
|
||||||
|
street_address, unit = parse_address_two(property_info["address"]["line"])
|
||||||
|
|
||||||
return [Property(
|
return [
|
||||||
|
Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=Address(
|
address=Address(
|
||||||
address_one=property_info['address']['line'],
|
street_address=street_address,
|
||||||
city=property_info['address']['city'],
|
city=property_info["address"]["city"],
|
||||||
state=property_info['address']['state_code'],
|
state=property_info["address"]["state_code"],
|
||||||
zip_code=property_info['address']['postal_code'],
|
zip_code=property_info["address"]["postal_code"],
|
||||||
|
unit=unit,
|
||||||
|
country="USA",
|
||||||
),
|
),
|
||||||
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'],
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
beds=property_info['basic']['beds'],
|
+ property_info["details"]["permalink"],
|
||||||
baths=property_info['basic']['baths'],
|
beds=property_info["basic"]["beds"],
|
||||||
stories=property_info['details']['stories'],
|
baths=property_info["basic"]["baths"],
|
||||||
year_built=property_info['details']['year_built'],
|
stories=property_info["details"]["stories"],
|
||||||
square_feet=property_info['basic']['sqft'],
|
year_built=property_info["details"]["year_built"],
|
||||||
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft']
|
square_feet=property_info["basic"]["sqft"],
|
||||||
if property_info['basic']['sqft'] is not None and
|
price_per_sqft=property_info["basic"]["price"]
|
||||||
property_info['basic']['price'] is not None
|
// property_info["basic"]["sqft"]
|
||||||
|
if property_info["basic"]["sqft"] is not None
|
||||||
|
and property_info["basic"]["price"] is not None
|
||||||
else None,
|
else None,
|
||||||
price=property_info['basic']['price'],
|
price=property_info["basic"]["price"],
|
||||||
mls_id=property_id,
|
mls_id=property_id,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None,
|
lot_area_value=property_info["public_record"]["lot_size"]
|
||||||
)]
|
if property_info["public_record"] is not None
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
|
def handle_area(
|
||||||
query = """query Home_search(
|
self, variables: dict, return_total: bool = False
|
||||||
|
) -> list[Property] | int:
|
||||||
|
query = (
|
||||||
|
"""query Home_search(
|
||||||
$city: String,
|
$city: String,
|
||||||
$county: [String],
|
$county: [String],
|
||||||
$state_code: String,
|
$state_code: String,
|
||||||
@@ -184,6 +195,10 @@ class RealtorScraper(Scraper):
|
|||||||
street_post_direction
|
street_post_direction
|
||||||
street_suffix
|
street_suffix
|
||||||
unit
|
unit
|
||||||
|
coordinate {
|
||||||
|
lon
|
||||||
|
lat
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
list_price
|
list_price
|
||||||
@@ -193,42 +208,74 @@ class RealtorScraper(Scraper):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}""" % self.listing_type.value
|
}"""
|
||||||
|
% self.listing_type.value.lower()
|
||||||
|
)
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'query': query,
|
"query": query,
|
||||||
'variables': variables,
|
"variables": variables,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.session.post(self.search_url, json=payload)
|
response = self.session.post(self.search_url, json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
if return_total:
|
if return_total:
|
||||||
return response_json['data']['home_search']['total']
|
return response_json["data"]["home_search"]["total"]
|
||||||
|
|
||||||
properties: list[Property] = []
|
properties: list[Property] = []
|
||||||
|
|
||||||
for result in response_json['data']['home_search']['results']:
|
if (
|
||||||
|
response_json is None
|
||||||
|
or "data" not in response_json
|
||||||
|
or response_json["data"] is None
|
||||||
|
or "home_search" not in response_json["data"]
|
||||||
|
or response_json["data"]["home_search"] is None
|
||||||
|
or "results" not in response_json["data"]["home_search"]
|
||||||
|
):
|
||||||
|
return []
|
||||||
|
|
||||||
|
for result in response_json["data"]["home_search"]["results"]:
|
||||||
|
street_address, unit = parse_address_two(
|
||||||
|
result["location"]["address"]["line"]
|
||||||
|
)
|
||||||
realty_property = Property(
|
realty_property = Property(
|
||||||
address=Address(
|
address=Address(
|
||||||
address_one=result['location']['address']['line'],
|
street_address=street_address,
|
||||||
city=result['location']['address']['city'],
|
city=result["location"]["address"]["city"],
|
||||||
state=result['location']['address']['state_code'],
|
state=result["location"]["address"]["state_code"],
|
||||||
zip_code=result['location']['address']['postal_code'],
|
zip_code=result["location"]["address"]["postal_code"],
|
||||||
address_two=result['location']['address']['unit'],
|
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||||
|
country="USA",
|
||||||
),
|
),
|
||||||
|
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||||
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lat" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
|
longitude=result["location"]["address"]["coordinate"]["lon"]
|
||||||
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lon" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
beds=result['description']['beds'],
|
+ result["property_id"],
|
||||||
baths=result['description']['baths'],
|
beds=result["description"]["beds"],
|
||||||
stories=result['description']['stories'],
|
baths=result["description"]["baths"],
|
||||||
year_built=result['description']['year_built'],
|
stories=result["description"]["stories"],
|
||||||
square_feet=result['description']['sqft'],
|
year_built=result["description"]["year_built"],
|
||||||
price_per_square_foot=result['price_per_sqft'],
|
square_feet=result["description"]["sqft"],
|
||||||
price=result['list_price'],
|
price_per_sqft=result["price_per_sqft"],
|
||||||
mls_id=result['property_id'],
|
price=result["list_price"],
|
||||||
|
mls_id=result["property_id"],
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_size=result['description']['lot_sqft'],
|
lot_area_value=result["description"]["lot_sqft"],
|
||||||
)
|
)
|
||||||
|
|
||||||
properties.append(realty_property)
|
properties.append(realty_property)
|
||||||
@@ -239,17 +286,17 @@ class RealtorScraper(Scraper):
|
|||||||
location_info = self.handle_location()
|
location_info = self.handle_location()
|
||||||
location_type = location_info["area_type"]
|
location_type = location_info["area_type"]
|
||||||
|
|
||||||
if location_type == 'address':
|
if location_type == "address":
|
||||||
property_id = location_info['mpr_id']
|
property_id = location_info["mpr_id"]
|
||||||
return self.handle_address(property_id)
|
return self.handle_address(property_id)
|
||||||
|
|
||||||
offset = 0
|
offset = 0
|
||||||
search_variables = {
|
search_variables = {
|
||||||
'city': location_info.get('city'),
|
"city": location_info.get("city"),
|
||||||
'county': location_info.get('county'),
|
"county": location_info.get("county"),
|
||||||
'state_code': location_info.get('state_code'),
|
"state_code": location_info.get("state_code"),
|
||||||
'postal_code': location_info.get('postal_code'),
|
"postal_code": location_info.get("postal_code"),
|
||||||
'offset': offset,
|
"offset": offset,
|
||||||
}
|
}
|
||||||
|
|
||||||
total = self.handle_area(search_variables, return_total=True)
|
total = self.handle_area(search_variables, return_total=True)
|
||||||
@@ -258,8 +305,11 @@ class RealtorScraper(Scraper):
|
|||||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
futures = [
|
futures = [
|
||||||
executor.submit(
|
executor.submit(
|
||||||
self.handle_area, variables=search_variables | {'offset': i}, return_total=False
|
self.handle_area,
|
||||||
) for i in range(0, total, 200)
|
variables=search_variables | {"offset": i},
|
||||||
|
return_total=False,
|
||||||
|
)
|
||||||
|
for i in range(0, total, 200)
|
||||||
]
|
]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
import json
|
import json
|
||||||
from ..models import Property, Address, PropertyType, Building
|
|
||||||
from .. import Scraper
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from .. import Scraper
|
||||||
|
from ....utils import parse_address_two, parse_unit
|
||||||
|
from ..models import Property, Address, PropertyType, ListingType, SiteName
|
||||||
|
from ....exceptions import NoResultsFound
|
||||||
|
|
||||||
|
|
||||||
class RedfinScraper(Scraper):
|
class RedfinScraper(Scraper):
|
||||||
@@ -25,6 +27,11 @@ class RedfinScraper(Scraper):
|
|||||||
elif match_type == "1":
|
elif match_type == "1":
|
||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
|
||||||
|
if "exactMatch" not in response_json["payload"]:
|
||||||
|
raise NoResultsFound(
|
||||||
|
"No results found for location: {}".format(self.location)
|
||||||
|
)
|
||||||
|
|
||||||
if response_json["payload"]["exactMatch"] is not None:
|
if response_json["payload"]["exactMatch"] is not None:
|
||||||
target = response_json["payload"]["exactMatch"]
|
target = response_json["payload"]["exactMatch"]
|
||||||
else:
|
else:
|
||||||
@@ -38,24 +45,33 @@ class RedfinScraper(Scraper):
|
|||||||
return home[key]["value"]
|
return home[key]["value"]
|
||||||
|
|
||||||
if not single_search:
|
if not single_search:
|
||||||
|
street_address, unit = parse_address_two(get_value("streetLine"))
|
||||||
|
unit = parse_unit(get_value("streetLine"))
|
||||||
address = Address(
|
address = Address(
|
||||||
address_one=get_value("streetLine"),
|
street_address=street_address,
|
||||||
city=home["city"],
|
city=home["city"],
|
||||||
state=home["state"],
|
state=home["state"],
|
||||||
zip_code=home["zip"],
|
zip_code=home["zip"],
|
||||||
|
unit=unit,
|
||||||
|
country="USA",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
address_info = home["streetAddress"]
|
address_info = home["streetAddress"]
|
||||||
|
street_address, unit = parse_address_two(address_info["assembledAddress"])
|
||||||
|
|
||||||
address = Address(
|
address = Address(
|
||||||
address_one=address_info["assembledAddress"],
|
street_address=street_address,
|
||||||
city=home["city"],
|
city=home["city"],
|
||||||
state=home["state"],
|
state=home["state"],
|
||||||
zip_code=home["zip"],
|
zip_code=home["zip"],
|
||||||
|
unit=unit,
|
||||||
|
country="USA",
|
||||||
)
|
)
|
||||||
|
|
||||||
url = "https://www.redfin.com{}".format(home["url"])
|
url = "https://www.redfin.com{}".format(home["url"])
|
||||||
property_type = home["propertyType"] if "propertyType" in home else None
|
#: property_type = home["propertyType"] if "propertyType" in home else None
|
||||||
lot_size_data = home.get("lotSize")
|
lot_size_data = home.get("lotSize")
|
||||||
|
|
||||||
if not isinstance(lot_size_data, int):
|
if not isinstance(lot_size_data, int):
|
||||||
lot_size = (
|
lot_size = (
|
||||||
lot_size_data.get("value", None)
|
lot_size_data.get("value", None)
|
||||||
@@ -69,7 +85,7 @@ class RedfinScraper(Scraper):
|
|||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
address=address,
|
address=address,
|
||||||
url=url,
|
property_url=url,
|
||||||
beds=home["beds"] if "beds" in home else None,
|
beds=home["beds"] if "beds" in home else None,
|
||||||
baths=home["baths"] if "baths" in home else None,
|
baths=home["baths"] if "baths" in home else None,
|
||||||
stories=home["stories"] if "stories" in home else None,
|
stories=home["stories"] if "stories" in home else None,
|
||||||
@@ -79,40 +95,108 @@ class RedfinScraper(Scraper):
|
|||||||
if not single_search
|
if not single_search
|
||||||
else home["yearBuilt"],
|
else home["yearBuilt"],
|
||||||
square_feet=get_value("sqFt"),
|
square_feet=get_value("sqFt"),
|
||||||
lot_size=lot_size,
|
lot_area_value=lot_size,
|
||||||
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
||||||
price_per_square_foot=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt"),
|
||||||
price=get_value("price"),
|
price=get_value("price"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
|
latitude=home["latLong"]["latitude"]
|
||||||
|
if "latLong" in home and "latitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
|
longitude=home["latLong"]["longitude"]
|
||||||
|
if "latLong" in home and "longitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_building(self, building: dict) -> Building:
|
def _handle_rentals(self, region_id, region_type):
|
||||||
return Building(
|
url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
|
||||||
|
|
||||||
|
response = self.session.get(url)
|
||||||
|
response.raise_for_status()
|
||||||
|
homes = response.json()
|
||||||
|
|
||||||
|
properties_list = []
|
||||||
|
|
||||||
|
for home in homes["homes"]:
|
||||||
|
home_data = home["homeData"]
|
||||||
|
rental_data = home["rentalExtension"]
|
||||||
|
|
||||||
|
property_url = f"https://www.redfin.com{home_data.get('url', '')}"
|
||||||
|
address_info = home_data.get("addressInfo", {})
|
||||||
|
centroid = address_info.get("centroid", {}).get("centroid", {})
|
||||||
address = Address(
|
address = Address(
|
||||||
address_one=" ".join(
|
street_address=address_info.get("formattedStreetLine", None),
|
||||||
[
|
city=address_info.get("city", None),
|
||||||
building['address']['streetNumber'],
|
state=address_info.get("state", None),
|
||||||
building['address']['directionalPrefix'],
|
zip_code=address_info.get("zip", None),
|
||||||
building['address']['streetName'],
|
unit=None,
|
||||||
building['address']['streetType'],
|
country="US" if address_info.get("countryCode", None) == 1 else None,
|
||||||
]
|
|
||||||
),
|
|
||||||
city=building['address']['city'],
|
|
||||||
state=building['address']['stateOrProvinceCode'],
|
|
||||||
zip_code=building['address']['postalCode'],
|
|
||||||
address_two=" ".join(
|
|
||||||
[
|
|
||||||
building['address']['unitType'],
|
|
||||||
building['address']['unitValue'],
|
|
||||||
]
|
|
||||||
)
|
|
||||||
),
|
|
||||||
site_name=self.site_name,
|
|
||||||
url="https://www.redfin.com{}".format(building["url"]),
|
|
||||||
listing_type=self.listing_type,
|
|
||||||
num_units=building["numUnitsForSale"],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
|
||||||
|
bed_range = rental_data.get("bedRange", {"min": None, "max": None})
|
||||||
|
bath_range = rental_data.get("bathRange", {"min": None, "max": None})
|
||||||
|
sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})
|
||||||
|
|
||||||
|
property_ = Property(
|
||||||
|
property_url=property_url,
|
||||||
|
site_name=SiteName.REDFIN,
|
||||||
|
listing_type=ListingType.FOR_RENT,
|
||||||
|
address=address,
|
||||||
|
apt_min_beds=bed_range.get("min", None),
|
||||||
|
apt_min_baths=bath_range.get("min", None),
|
||||||
|
apt_max_beds=bed_range.get("max", None),
|
||||||
|
apt_max_baths=bath_range.get("max", None),
|
||||||
|
description=rental_data.get("description", None),
|
||||||
|
latitude=centroid.get("latitude", None),
|
||||||
|
longitude=centroid.get("longitude", None),
|
||||||
|
apt_min_price=price_range.get("min", None),
|
||||||
|
apt_max_price=price_range.get("max", None),
|
||||||
|
apt_min_sqft=sqft_range.get("min", None),
|
||||||
|
apt_max_sqft=sqft_range.get("max", None),
|
||||||
|
img_src=home_data.get("staticMapUrl", None),
|
||||||
|
posted_time=rental_data.get("lastUpdated", None),
|
||||||
|
bldg_name=rental_data.get("propertyName", None),
|
||||||
|
)
|
||||||
|
|
||||||
|
properties_list.append(property_)
|
||||||
|
|
||||||
|
if not properties_list:
|
||||||
|
raise NoResultsFound("No rentals found for the given location.")
|
||||||
|
|
||||||
|
return properties_list
|
||||||
|
|
||||||
|
def _parse_building(self, building: dict) -> Property:
|
||||||
|
street_address = " ".join(
|
||||||
|
[
|
||||||
|
building["address"]["streetNumber"],
|
||||||
|
building["address"]["directionalPrefix"],
|
||||||
|
building["address"]["streetName"],
|
||||||
|
building["address"]["streetType"],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
street_address, unit = parse_address_two(street_address)
|
||||||
|
return Property(
|
||||||
|
site_name=self.site_name,
|
||||||
|
property_type=PropertyType("BUILDING"),
|
||||||
|
address=Address(
|
||||||
|
street_address=street_address,
|
||||||
|
city=building["address"]["city"],
|
||||||
|
state=building["address"]["stateOrProvinceCode"],
|
||||||
|
zip_code=building["address"]["postalCode"],
|
||||||
|
unit=parse_unit(
|
||||||
|
" ".join(
|
||||||
|
[
|
||||||
|
building["address"]["unitType"],
|
||||||
|
building["address"]["unitValue"],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
),
|
||||||
|
),
|
||||||
|
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||||
|
listing_type=self.listing_type,
|
||||||
|
bldg_unit_count=building["numUnitsForSale"],
|
||||||
|
)
|
||||||
|
|
||||||
def handle_address(self, home_id: str):
|
def handle_address(self, home_id: str):
|
||||||
"""
|
"""
|
||||||
@@ -142,17 +226,19 @@ class RedfinScraper(Scraper):
|
|||||||
home_id = region_id
|
home_id = region_id
|
||||||
return self.handle_address(home_id)
|
return self.handle_address(home_id)
|
||||||
|
|
||||||
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
|
if self.listing_type == ListingType.FOR_RENT:
|
||||||
region_id, region_type
|
return self._handle_rentals(region_id, region_type)
|
||||||
)
|
else:
|
||||||
|
if self.listing_type == ListingType.FOR_SALE:
|
||||||
|
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
|
||||||
|
else:
|
||||||
|
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
|
|
||||||
homes = [
|
homes = [
|
||||||
self._parse_home(home) for home in response_json["payload"]["homes"]
|
self._parse_home(home) for home in response_json["payload"]["homes"]
|
||||||
] + [
|
] + [
|
||||||
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
|
self._parse_building(building)
|
||||||
|
for building in response_json["payload"]["buildings"].values()
|
||||||
]
|
]
|
||||||
|
|
||||||
return homes
|
return homes
|
||||||
|
|||||||
@@ -1,21 +1,39 @@
|
|||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from ..models import Property, Address, Building, ListingType, PropertyType
|
|
||||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
|
from ....utils import parse_address_two, parse_unit
|
||||||
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
|
from ..models import Property, Address, ListingType, PropertyType
|
||||||
|
|
||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
self.listing_type = scraper_input.listing_type
|
|
||||||
|
if not self.is_plausible_location(self.location):
|
||||||
|
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||||
|
|
||||||
if self.listing_type == ListingType.FOR_SALE:
|
if self.listing_type == ListingType.FOR_SALE:
|
||||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
||||||
elif self.listing_type == ListingType.FOR_RENT:
|
elif self.listing_type == ListingType.FOR_RENT:
|
||||||
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
|
||||||
|
else:
|
||||||
|
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
||||||
|
|
||||||
|
def is_plausible_location(self, location: str) -> bool:
|
||||||
|
url = (
|
||||||
|
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
|
||||||
|
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
|
||||||
|
).format(location)
|
||||||
|
|
||||||
|
response = self.session.get(url)
|
||||||
|
|
||||||
|
return response.json()["results"] != []
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(self.url, headers=self._get_headers())
|
resp = self.session.get(
|
||||||
|
self.url, headers=self._get_headers()
|
||||||
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
content = resp.text
|
content = resp.text
|
||||||
|
|
||||||
@@ -33,10 +51,17 @@ class ZillowScraper(Scraper):
|
|||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
|
|
||||||
if "searchPageState" in data["props"]["pageProps"]:
|
if "searchPageState" in data["props"]["pageProps"]:
|
||||||
houses = data["props"]["pageProps"]["searchPageState"]["cat1"][
|
pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
|
||||||
"searchResults"
|
|
||||||
]["listResults"]
|
match = re.search(pattern, content)
|
||||||
return [self._parse_home(house) for house in houses]
|
|
||||||
|
if match:
|
||||||
|
coords = [float(coord) for coord in match.groups()]
|
||||||
|
return self._fetch_properties_backend(coords)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise GeoCoordsNotFound("Box bounds could not be located.")
|
||||||
|
|
||||||
elif "gdpClientCache" in data["props"]["pageProps"]:
|
elif "gdpClientCache" in data["props"]["pageProps"]:
|
||||||
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
|
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
|
||||||
main_key = list(gdp_client_cache.keys())[0]
|
main_key = list(gdp_client_cache.keys())[0]
|
||||||
@@ -45,47 +70,166 @@ class ZillowScraper(Scraper):
|
|||||||
property = self._get_single_property_page(property_data)
|
property = self._get_single_property_page(property_data)
|
||||||
|
|
||||||
return [property]
|
return [property]
|
||||||
raise PropertyNotFound("Specific property data not found in the response.")
|
raise NoResultsFound("Specific property data not found in the response.")
|
||||||
|
|
||||||
def _parse_home(self, home: dict):
|
def _fetch_properties_backend(self, coords):
|
||||||
"""
|
url = "https://www.zillow.com/async-create-search-page-state"
|
||||||
This method is used when a user enters a generic location & zillow returns more than one property
|
|
||||||
"""
|
filter_state_for_sale = {
|
||||||
url = (
|
"sortSelection": {
|
||||||
f"https://www.zillow.com{home['detailUrl']}"
|
# "value": "globalrelevanceex"
|
||||||
if "zillow.com" not in home["detailUrl"]
|
"value": "days"
|
||||||
else home["detailUrl"]
|
},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter_state_for_rent = {
|
||||||
|
"isForRent": {"value": True},
|
||||||
|
"isForSaleByAgent": {"value": False},
|
||||||
|
"isForSaleByOwner": {"value": False},
|
||||||
|
"isNewConstruction": {"value": False},
|
||||||
|
"isComingSoon": {"value": False},
|
||||||
|
"isAuction": {"value": False},
|
||||||
|
"isForSaleForeclosure": {"value": False},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter_state_sold = {
|
||||||
|
"isRecentlySold": {"value": True},
|
||||||
|
"isForSaleByAgent": {"value": False},
|
||||||
|
"isForSaleByOwner": {"value": False},
|
||||||
|
"isNewConstruction": {"value": False},
|
||||||
|
"isComingSoon": {"value": False},
|
||||||
|
"isAuction": {"value": False},
|
||||||
|
"isForSaleForeclosure": {"value": False},
|
||||||
|
"isAllHomes": {"value": True},
|
||||||
|
}
|
||||||
|
|
||||||
|
selected_filter = (
|
||||||
|
filter_state_for_rent
|
||||||
|
if self.listing_type == ListingType.FOR_RENT
|
||||||
|
else filter_state_for_sale
|
||||||
|
if self.listing_type == ListingType.FOR_SALE
|
||||||
|
else filter_state_sold
|
||||||
)
|
)
|
||||||
|
|
||||||
if "hdpData" in home and "homeInfo" in home["hdpData"]:
|
payload = {
|
||||||
price_data = self._extract_price(home)
|
"searchQueryState": {
|
||||||
address = self._extract_address(home)
|
"pagination": {},
|
||||||
agent_name = self._extract_agent_name(home)
|
"isMapVisible": True,
|
||||||
beds = home["hdpData"]["homeInfo"]["bedrooms"]
|
"mapBounds": {
|
||||||
baths = home["hdpData"]["homeInfo"]["bathrooms"]
|
"west": coords[0],
|
||||||
property_type = home["hdpData"]["homeInfo"].get("homeType")
|
"east": coords[1],
|
||||||
|
"south": coords[2],
|
||||||
return Property(
|
"north": coords[3],
|
||||||
site_name=self.site_name,
|
},
|
||||||
address=address,
|
"filterState": selected_filter,
|
||||||
agent_name=agent_name,
|
"isListVisible": True,
|
||||||
url=url,
|
"mapZoom": 11,
|
||||||
beds=beds,
|
},
|
||||||
baths=baths,
|
"wants": {"cat1": ["mapResults"]},
|
||||||
listing_type=self.listing_type,
|
"isDebugRequest": False,
|
||||||
property_type=PropertyType(property_type),
|
}
|
||||||
**price_data,
|
resp = self.session.put(
|
||||||
|
url, headers=self._get_headers(), json=payload
|
||||||
)
|
)
|
||||||
else:
|
resp.raise_for_status()
|
||||||
keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
|
a = resp.json()
|
||||||
address_one, city, state, zip_code = (home[key] for key in keys)
|
return self._parse_properties(resp.json())
|
||||||
address_one, address_two = self._parse_address_two(address_one)
|
|
||||||
address = Address(address_one, city, state, zip_code, address_two)
|
|
||||||
|
|
||||||
building_info = self._extract_building_info(home)
|
def _parse_properties(self, property_data: dict):
|
||||||
return Building(
|
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
||||||
site_name=self.site_name, address=address, url=url, **building_info
|
|
||||||
|
properties_list = []
|
||||||
|
|
||||||
|
for result in mapresults:
|
||||||
|
if "hdpData" in result:
|
||||||
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
|
address_data = {
|
||||||
|
"street_address": parse_address_two(home_info["streetAddress"])[0],
|
||||||
|
"unit": parse_unit(home_info["unit"])
|
||||||
|
if "unit" in home_info
|
||||||
|
else None,
|
||||||
|
"city": home_info["city"],
|
||||||
|
"state": home_info["state"],
|
||||||
|
"zip_code": home_info["zipcode"],
|
||||||
|
"country": home_info["country"],
|
||||||
|
}
|
||||||
|
property_data = {
|
||||||
|
"site_name": self.site_name,
|
||||||
|
"address": Address(**address_data),
|
||||||
|
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
||||||
|
"beds": int(home_info["bedrooms"])
|
||||||
|
if "bedrooms" in home_info
|
||||||
|
else None,
|
||||||
|
"baths": home_info.get("bathrooms"),
|
||||||
|
"square_feet": int(home_info["livingArea"])
|
||||||
|
if "livingArea" in home_info
|
||||||
|
else None,
|
||||||
|
"currency": home_info["currency"],
|
||||||
|
"price": home_info.get("price"),
|
||||||
|
"tax_assessed_value": int(home_info["taxAssessedValue"])
|
||||||
|
if "taxAssessedValue" in home_info
|
||||||
|
else None,
|
||||||
|
"property_type": PropertyType(home_info["homeType"]),
|
||||||
|
"listing_type": ListingType(
|
||||||
|
home_info["statusType"]
|
||||||
|
if "statusType" in home_info
|
||||||
|
else self.listing_type
|
||||||
|
),
|
||||||
|
"lot_area_value": round(home_info["lotAreaValue"], 2)
|
||||||
|
if "lotAreaValue" in home_info
|
||||||
|
else None,
|
||||||
|
"lot_area_unit": home_info.get("lotAreaUnit"),
|
||||||
|
"latitude": result["latLong"]["latitude"],
|
||||||
|
"longitude": result["latLong"]["longitude"],
|
||||||
|
"status_text": result.get("statusText"),
|
||||||
|
"posted_time": result["variableData"]["text"]
|
||||||
|
if "variableData" in result
|
||||||
|
and "text" in result["variableData"]
|
||||||
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
|
else None,
|
||||||
|
"img_src": result.get("imgSrc"),
|
||||||
|
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
|
||||||
|
if "livingArea" in home_info
|
||||||
|
and home_info["livingArea"] != 0
|
||||||
|
and "price" in home_info
|
||||||
|
else None,
|
||||||
|
}
|
||||||
|
property_obj = Property(**property_data)
|
||||||
|
properties_list.append(property_obj)
|
||||||
|
|
||||||
|
elif "isBuilding" in result:
|
||||||
|
price = result["price"]
|
||||||
|
building_data = {
|
||||||
|
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
||||||
|
"site_name": self.site_name,
|
||||||
|
"property_type": PropertyType("BUILDING"),
|
||||||
|
"listing_type": ListingType(result["statusType"]),
|
||||||
|
"img_src": result["imgSrc"],
|
||||||
|
"price": int(price.replace("From $", "").replace(",", ""))
|
||||||
|
if "From $" in price
|
||||||
|
else None,
|
||||||
|
"apt_min_price": int(
|
||||||
|
price.replace("$", "").replace(",", "").replace("+/mo", "")
|
||||||
)
|
)
|
||||||
|
if "+/mo" in price
|
||||||
|
else None,
|
||||||
|
"address": self._extract_address(result["address"]),
|
||||||
|
"bldg_min_beds": result["minBeds"],
|
||||||
|
"currency": "USD",
|
||||||
|
"bldg_min_baths": result["minBaths"],
|
||||||
|
"bldg_min_area": result.get("minArea"),
|
||||||
|
"bldg_unit_count": result["unitCount"],
|
||||||
|
"bldg_name": result.get("communityName"),
|
||||||
|
"status_text": result["statusText"],
|
||||||
|
"latitude": result["latLong"]["latitude"],
|
||||||
|
"longitude": result["latLong"]["longitude"],
|
||||||
|
}
|
||||||
|
building_obj = Property(**building_data)
|
||||||
|
properties_list.append(building_obj)
|
||||||
|
|
||||||
|
return properties_list
|
||||||
|
|
||||||
def _get_single_property_page(self, property_data: dict):
|
def _get_single_property_page(self, property_data: dict):
|
||||||
"""
|
"""
|
||||||
@@ -97,32 +241,38 @@ class ZillowScraper(Scraper):
|
|||||||
else property_data["hdpUrl"]
|
else property_data["hdpUrl"]
|
||||||
)
|
)
|
||||||
address_data = property_data["address"]
|
address_data = property_data["address"]
|
||||||
address_one, address_two = self._parse_address_two(
|
street_address, unit = parse_address_two(address_data["streetAddress"])
|
||||||
address_data["streetAddress"]
|
|
||||||
)
|
|
||||||
address = Address(
|
address = Address(
|
||||||
address_one=address_one,
|
street_address=street_address,
|
||||||
address_two=address_two,
|
unit=unit,
|
||||||
city=address_data["city"],
|
city=address_data["city"],
|
||||||
state=address_data["state"],
|
state=address_data["state"],
|
||||||
zip_code=address_data["zipcode"],
|
zip_code=address_data["zipcode"],
|
||||||
|
country=property_data.get("country"),
|
||||||
)
|
)
|
||||||
property_type = property_data.get("homeType", None)
|
property_type = property_data.get("homeType", None)
|
||||||
|
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=address,
|
address=address,
|
||||||
url=url,
|
property_url=url,
|
||||||
beds=property_data.get("bedrooms", None),
|
beds=property_data.get("bedrooms", None),
|
||||||
baths=property_data.get("bathrooms", None),
|
baths=property_data.get("bathrooms", None),
|
||||||
year_built=property_data.get("yearBuilt", None),
|
year_built=property_data.get("yearBuilt", None),
|
||||||
price=property_data.get("price", None),
|
price=property_data.get("price", None),
|
||||||
lot_size=property_data.get("lotSize", None),
|
tax_assessed_value=property_data.get("taxAssessedValue", None),
|
||||||
|
latitude=property_data.get("latitude"),
|
||||||
|
longitude=property_data.get("longitude"),
|
||||||
|
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
|
||||||
|
currency=property_data.get("currency", None),
|
||||||
|
lot_area_value=property_data.get("lotAreaValue"),
|
||||||
|
lot_area_unit=property_data["lotAreaUnits"].lower()
|
||||||
|
if "lotAreaUnits" in property_data
|
||||||
|
else None,
|
||||||
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
|
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
|
||||||
stories=property_data.get("resoFacts", {}).get("stories", None),
|
stories=property_data.get("resoFacts", {}).get("stories", None),
|
||||||
description=property_data.get("description", None),
|
description=property_data.get("description", None),
|
||||||
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
|
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
|
||||||
price_per_square_foot=property_data.get("resoFacts", {}).get(
|
price_per_sqft=property_data.get("resoFacts", {}).get(
|
||||||
"pricePerSquareFoot", None
|
"pricePerSquareFoot", None
|
||||||
),
|
),
|
||||||
square_feet=property_data.get("livingArea", None),
|
square_feet=property_data.get("livingArea", None),
|
||||||
@@ -130,81 +280,54 @@ class ZillowScraper(Scraper):
|
|||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_building_info(self, home: dict) -> dict:
|
def _extract_address(self, address_str):
|
||||||
num_units = len(home["units"])
|
"""
|
||||||
prices = [
|
Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
and return an Address object.
|
||||||
for unit in home["units"]
|
"""
|
||||||
]
|
parts = address_str.split(", ")
|
||||||
return {
|
|
||||||
"listing_type": self.listing_type,
|
if len(parts) != 3:
|
||||||
"num_units": len(home["units"]),
|
raise ValueError(f"Unexpected address format: {address_str}")
|
||||||
"min_unit_price": min(
|
|
||||||
(
|
street_address = parts[0].strip()
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
city = parts[1].strip()
|
||||||
for unit in home["units"]
|
state_zip = parts[2].split(" ")
|
||||||
|
|
||||||
|
if len(state_zip) == 1:
|
||||||
|
state = state_zip[0].strip()
|
||||||
|
zip_code = None
|
||||||
|
elif len(state_zip) == 2:
|
||||||
|
state = state_zip[0].strip()
|
||||||
|
zip_code = state_zip[1].strip()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||||
|
|
||||||
|
street_address, unit = parse_address_two(street_address)
|
||||||
|
return Address(
|
||||||
|
street_address=street_address,
|
||||||
|
city=city,
|
||||||
|
unit=unit,
|
||||||
|
state=state,
|
||||||
|
zip_code=zip_code,
|
||||||
|
country="USA",
|
||||||
)
|
)
|
||||||
),
|
|
||||||
"max_unit_price": max(
|
|
||||||
(
|
|
||||||
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
|
|
||||||
for unit in home["units"]
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"avg_unit_price": sum(prices) // len(prices) if num_units else None,
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_price(home: dict) -> dict:
|
|
||||||
price = int(home["hdpData"]["homeInfo"]["priceForHDP"])
|
|
||||||
square_feet = home["hdpData"]["homeInfo"].get("livingArea")
|
|
||||||
|
|
||||||
lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue")
|
|
||||||
price_per_square_foot = price // square_feet if square_feet and price else None
|
|
||||||
|
|
||||||
return {
|
|
||||||
k: v
|
|
||||||
for k, v in locals().items()
|
|
||||||
if k in ["price", "square_feet", "lot_size", "price_per_square_foot"]
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_agent_name(home: dict) -> str | None:
|
|
||||||
broker_str = home.get("brokerName", "")
|
|
||||||
match = re.search(r"Listing by: (.+)", broker_str)
|
|
||||||
return match.group(1) if match else None
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _parse_address_two(address_one: str):
|
|
||||||
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
|
|
||||||
address_two = apt_match.group().strip() if apt_match else None
|
|
||||||
address_one = (
|
|
||||||
address_one.replace(address_two, "").strip() if address_two else address_one
|
|
||||||
)
|
|
||||||
return address_one, address_two
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_address(home: dict) -> Address:
|
|
||||||
keys = ("streetAddress", "city", "state", "zipcode")
|
|
||||||
address_one, city, state, zip_code = (
|
|
||||||
home["hdpData"]["homeInfo"][key] for key in keys
|
|
||||||
)
|
|
||||||
address_one, address_two = ZillowScraper._parse_address_two(address_one)
|
|
||||||
return Address(address_one, city, state, zip_code, address_two=address_two)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_headers():
|
def _get_headers():
|
||||||
return {
|
return {
|
||||||
"authority": "parser-external.geo.moveaws.com",
|
"authority": "www.zillow.com",
|
||||||
"accept": "*/*",
|
"accept": "*/*",
|
||||||
"accept-language": "en-US,en;q=0.9",
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"content-type": "application/json",
|
||||||
|
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
|
||||||
"origin": "https://www.zillow.com",
|
"origin": "https://www.zillow.com",
|
||||||
"referer": "https://www.zillow.com/",
|
"referer": "https://www.zillow.com",
|
||||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
||||||
"sec-ch-ua-mobile": "?0",
|
"sec-ch-ua-mobile": "?0",
|
||||||
"sec-ch-ua-platform": '"Windows"',
|
"sec-ch-ua-platform": '"Windows"',
|
||||||
"sec-fetch-dest": "empty",
|
"sec-fetch-dest": "empty",
|
||||||
"sec-fetch-mode": "cors",
|
"sec-fetch-mode": "cors",
|
||||||
"sec-fetch-site": "cross-site",
|
"sec-fetch-site": "same-origin",
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,5 +10,5 @@ class NoResultsFound(Exception):
|
|||||||
"""Raised when no results are found for the given location"""
|
"""Raised when no results are found for the given location"""
|
||||||
|
|
||||||
|
|
||||||
class PropertyNotFound(Exception):
|
class GeoCoordsNotFound(Exception):
|
||||||
"""Raised when no property is found for the given address"""
|
"""Raised when no property is found for the given address"""
|
||||||
|
|||||||
48
homeharvest/utils.py
Normal file
48
homeharvest/utils.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def parse_address_two(street_address: str) -> tuple:
|
||||||
|
if not street_address:
|
||||||
|
return street_address, None
|
||||||
|
|
||||||
|
apt_match = re.search(
|
||||||
|
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
|
||||||
|
street_address,
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
if apt_match:
|
||||||
|
apt_str = apt_match.group().strip()
|
||||||
|
cleaned_apt_str = re.sub(
|
||||||
|
r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I
|
||||||
|
)
|
||||||
|
|
||||||
|
main_address = street_address.replace(apt_str, "").strip()
|
||||||
|
return main_address, cleaned_apt_str
|
||||||
|
else:
|
||||||
|
return street_address, None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_unit(street_address: str):
|
||||||
|
if not street_address:
|
||||||
|
return None
|
||||||
|
apt_match = re.search(
|
||||||
|
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$",
|
||||||
|
street_address,
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
if apt_match:
|
||||||
|
apt_str = apt_match.group().strip()
|
||||||
|
apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
|
||||||
|
return apt_str
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print(parse_address_two("4303 E Cactus Rd Apt 126"))
|
||||||
|
print(parse_address_two("1234 Elm Street apt 2B"))
|
||||||
|
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
||||||
|
print(parse_address_two("1234 Elm Street unit 3A"))
|
||||||
|
print(parse_address_two("1234 Elm Street SuIte 3A"))
|
||||||
27
poetry.lock
generated
27
poetry.lock
generated
@@ -106,6 +106,17 @@ files = [
|
|||||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "et-xmlfile"
|
||||||
|
version = "1.1.0"
|
||||||
|
description = "An implementation of lxml.xmlfile for the standard library"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
|
||||||
|
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "exceptiongroup"
|
name = "exceptiongroup"
|
||||||
version = "1.1.3"
|
version = "1.1.3"
|
||||||
@@ -217,6 +228,20 @@ files = [
|
|||||||
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
|
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "openpyxl"
|
||||||
|
version = "3.1.2"
|
||||||
|
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
|
||||||
|
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
et-xmlfile = "*"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "packaging"
|
name = "packaging"
|
||||||
version = "23.1"
|
version = "23.1"
|
||||||
@@ -425,4 +450,4 @@ zstd = ["zstandard (>=0.18.0)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "eede625d6d45085e143b0af246cb2ce00cff8579c667be3b63387c8594a5570d"
|
content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"
|
||||||
|
|||||||
@@ -1,15 +1,19 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.1.3"
|
version = "0.2.5"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
|
[tool.poetry.scripts]
|
||||||
|
homeharvest = "homeharvest.cli:main"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
pandas = "^2.1.0"
|
pandas = "^2.1.0"
|
||||||
|
openpyxl = "^3.1.2"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|||||||
@@ -1,12 +1,40 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format
|
location="2530 Al Lipscomb Way",
|
||||||
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format
|
site_name="realtor.com",
|
||||||
|
listing_type="for_sale",
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
|
||||||
|
), #: does not support "city, state, USA" format
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
||||||
|
), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="85281", site_name="realtor.com"),
|
scrape_property(location="85281", site_name="realtor.com"),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="realtor.com",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
@@ -1,12 +1,38 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
|
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="redfin"),
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
||||||
|
),
|
||||||
scrape_property(location="85281", site_name="redfin"),
|
scrape_property(location="85281", site_name="redfin"),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="redfin",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
@@ -1,12 +1,38 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_zillow():
|
def test_zillow():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name="zillow"),
|
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="zillow"),
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
||||||
|
),
|
||||||
scrape_property(location="85281", site_name="zillow"),
|
scrape_property(location="85281", site_name="zillow"),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="zillow",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
Reference in New Issue
Block a user