Compare commits

..

57 Commits

Author SHA1 Message Date
Zachary Hampton
0a39357a07 Merge pull request #12 from ZacharyHampton/proxy_bug
fix: proxy add to session correctly
2023-09-19 14:07:25 -07:00
Cullen Watson
8f06d46ddb chore: version number 2023-09-19 16:07:06 -05:00
Cullen Watson
0dae14ccfc fix: proxy add to session correctly 2023-09-19 16:05:14 -05:00
Zachary Hampton
9aaabdd5d8 Merge pull request #11 from ZacharyHampton/proxy_support
Proxy support
2023-09-19 13:50:14 -07:00
Cullen Watson
cdf41fe9f2 fix: remove self.proxy 2023-09-19 15:49:50 -05:00
Cullen Watson
1f0feb836d refactor: move proxy to session 2023-09-19 15:48:46 -05:00
Cullen Watson
5f31beda46 chore: version number 2023-09-19 15:44:41 -05:00
Cullen Watson
fd9cdea499 feat: proxy support 2023-09-19 15:43:24 -05:00
Zachary Hampton
93a1cbe17f Merge pull request #10 from ZacharyHampton/cli_homeharvest
add cli
2023-09-19 13:07:27 -07:00
Cullen Watson
49d27943c4 add cli 2023-09-19 15:01:39 -05:00
Zachary Hampton
05fca9b7e6 Update README.md 2023-09-19 11:08:08 -07:00
Zachary Hampton
20ce44fb3a - redfin limiting bug fix 2023-09-19 10:37:10 -07:00
Zachary Hampton
52017c1bb5 Merge pull request #9 from ZacharyHampton/redfin_rental_support
feat(redfin): rental support
2023-09-19 10:28:02 -07:00
Cullen Watson
dba1c03081 feat(redfin): add sold listing_type 2023-09-19 12:27:13 -05:00
Cullen Watson
1fc2d8c549 feat(redfin): rental support 2023-09-19 11:58:20 -05:00
Zachary Hampton
02d112eea0 Merge pull request #8 from ZacharyHampton/fix/zillow-location-validation
- zillow location validation
2023-09-19 09:33:33 -07:00
Zachary Hampton
30e510882b - version bump and excel support 2023-09-19 09:26:52 -07:00
Zachary Hampton
78b56c2cac - zillow location validation 2023-09-19 09:25:08 -07:00
Cullen Watson
087854a688 Merge branch 'master' of https://github.com/ZacharyHampton/HomeHarvest 2023-09-19 00:04:03 -05:00
Cullen Watson
80586467a8 docs:add guide 2023-09-18 23:53:10 -05:00
Cullen Watson
3494b152b8 docs: change install cmd 2023-09-18 23:32:51 -05:00
Cullen Watson
6c6fef80ed chore: change version number 2023-09-18 23:16:54 -05:00
Cullen Watson
62e3321277 fix(zillow): test case 2023-09-18 22:59:49 -05:00
Zachary Hampton
80186ee8c5 Merge remote-tracking branch 'origin/master'
# Conflicts:
#	homeharvest/__init__.py
2023-09-18 20:28:16 -07:00
Zachary Hampton
3ec47c5b6a - invalid test cases
- redfin and realtor bug fixes
- dupe check bug fix
2023-09-18 20:28:03 -07:00
Cullen Watson
42e8ac4de9 fix: drop dups if cols exist 2023-09-18 22:24:14 -05:00
Cullen Watson
e1917009ae docs: add gif 2023-09-18 21:47:55 -05:00
Zachary Hampton
7297f0eb33 Merge pull request #6 from ZacharyHampton/tidy_up_readme
Minor fixes
2023-09-18 19:04:08 -07:00
Cullen Watson
2eec389838 docs: add logo 2023-09-18 21:02:12 -05:00
Cullen Watson
b01162161d chore: merge 2023-09-18 20:09:28 -05:00
Cullen Watson
906ce92685 Merge remote-tracking branch 'origin' into tidy_up_readme 2023-09-18 20:01:59 -05:00
Cullen Watson
cc76e067b2 fix: lat/long KeyError 2023-09-18 20:01:55 -05:00
Zachary Hampton
1f0c351974 Merge pull request #4 from ZacharyHampton/tidy_up_readme
docs: readme
2023-09-18 17:47:13 -07:00
Zachary Hampton
a1684f87db Update pyproject.toml 2023-09-18 17:46:58 -07:00
Zachary Hampton
2ae3ebe28e Merge pull request #5 from ZacharyHampton/ZacharyHampton-patch-1
Update README.md
2023-09-18 17:45:48 -07:00
Zachary Hampton
ae3961514b Update README.md 2023-09-18 17:45:14 -07:00
Cullen Watson
0621b01d9a docs: readme 2023-09-18 19:40:49 -05:00
Cullen Watson
fbbd56d930 docs: remove proxy usage 2023-09-18 19:39:22 -05:00
Cullen Watson
82092faa28 docs: readme 2023-09-18 19:35:38 -05:00
Zachary Hampton
8f90a80b0a - lat lon on realtor & redfin 2023-09-18 16:22:47 -07:00
Zachary Hampton
d5b4d80f96 Merge pull request #3 from ZacharyHampton/all_3_sites
Check dups with city, street_address, unit
2023-09-18 16:00:27 -07:00
Cullen Watson
086bcfd224 fix: check for suite 2023-09-18 17:57:15 -05:00
Cullen Watson
4726764482 refactor: merge master 2023-09-18 17:46:05 -05:00
Cullen Watson
ca260fd2b4 fix: filter dup on street, unit, city 2023-09-18 17:42:16 -05:00
Zachary Hampton
94e5b090da - refactor 2023-09-18 15:22:43 -07:00
Zachary Hampton
d0a6a66b6a Merge pull request #2 from ZacharyHampton/all_3_sites
feat: run all 3 sites with one call
2023-09-18 15:17:50 -07:00
Cullen Watson
8e140a0e45 chore: format 2023-09-18 17:04:54 -05:00
Cullen Watson
588689c230 fix: normalize unit num 2023-09-18 17:04:34 -05:00
Cullen Watson
c7a4bfd5e4 feat: run all 3 sites with one scrape_property() call 2023-09-18 16:18:22 -05:00
Zachary Hampton
fe351ab57c Merge pull request #1 from ZacharyHampton/zillow_backend_ep 2023-09-18 13:52:43 -07:00
Cullen Watson
5d0f519a85 chore: update version number 2023-09-18 15:44:13 -05:00
Cullen Watson
869d7e7c51 refator(realtor): fit to updated models 2023-09-18 15:43:44 -05:00
Cullen Watson
ffd3ce6aed reactor(redfin) 2023-09-18 14:36:18 -05:00
Cullen Watson
471e53118e refactor(redfin): fit to use updated models 2023-09-18 14:07:37 -05:00
Cullen Watson
dc8c15959f fix: use zillow backend ep 2023-09-18 13:38:17 -05:00
Zachary Hampton
10c01f373e Update README.md
try with replit
2023-09-18 10:01:52 -07:00
Zachary Hampton
fd01bfb8b8 Update README.md 2023-09-18 08:45:31 -07:00
17 changed files with 1123 additions and 357 deletions

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@
**/.pytest_cache/ **/.pytest_cache/
*.pyc *.pyc
/.ipynb_checkpoints/ /.ipynb_checkpoints/
*.csv

View File

@@ -31,8 +31,30 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# scrapes all 3 sites by default\n",
"scrape_property(\n", "scrape_property(\n",
" location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n", " location=\"dallas\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaf86093",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# search a specific address\n",
"scrape_property(\n",
" location=\"2530 Al Lipscomb Way\",\n",
" site_name=\"zillow\",\n",
" listing_type=\"for_sale\"\n",
")" ")"
] ]
}, },
@@ -43,8 +65,31 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# check rentals\n",
"scrape_property(\n", "scrape_property(\n",
" location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n", " location=\"chicago, illinois\",\n",
" site_name=[\"redfin\", \"zillow\"],\n",
" listing_type=\"for_rent\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af280cd3",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# check sold properties\n",
"scrape_property(\n",
" location=\"90210\",\n",
" site_name=[\"redfin\"],\n",
" listing_type=\"sold\"\n",
")" ")"
] ]
} }

164
README.md
View File

@@ -1,33 +1,165 @@
# HomeHarvest <img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
**HomeHarvest** aims to be the top Python real estate scraping library. **HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._ [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
## Features
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
- Aggregates the properties in a Pandas DataFrame
[Video Guide for HomeHarvest](https://www.youtube.com/watch?v=HCoHoiJdWQY)
![homeharvest](https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/b3d5d727-e67b-4a9f-85d8-1e65fd18620a)
## Installation ## Installation
```bash ```bash
pip install --upgrade homeharvest pip install --force-reinstall homeharvest
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
## Usage
### CLI
```bash
homeharvest "San Francisco, CA" -s zillow realtor.com redfin -l for_rent -o excel -f HomeHarvest
``` ```
## Example Usage This will scrape properties from the specified sites for the given location and listing type, and save the results to an Excel file named `HomeHarvest.xlsx`.
```
By default:
- If `-s` or `--site_name` is not provided, it will scrape from all available sites.
- If `-l` or `--listing_type` is left blank, the default is `for_sale`. Other options are `for_rent` or `sold`.
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
### Python
```py
from homeharvest import scrape_property from homeharvest import scrape_property
import pandas as pd
properties = scrape_property( properties: pd.DataFrame = scrape_property(
location="85281", site_name="zillow", listing_type="for_rent" site_name=["zillow", "realtor.com", "redfin"],
location="85281",
listing_type="for_rent" # for_sale / sold
) )
#: Note, to export to CSV or Excel, use properties.to_csv() or properties.to_excel().
print(properties) print(properties)
``` ```
### Site Name Options ## Output
```py
>>> properties.head()
property_url site_name listing_type apt_min_price apt_max_price ...
0 https://www.redfin.com/AZ/Tempe/1003-W-Washing... redfin for_rent 1666.0 2750.0 ...
1 https://www.redfin.com/AZ/Tempe/VELA-at-Town-L... redfin for_rent 1665.0 3763.0 ...
2 https://www.redfin.com/AZ/Tempe/Camden-Tempe/a... redfin for_rent 1939.0 3109.0 ...
3 https://www.redfin.com/AZ/Tempe/Emerson-Park/a... redfin for_rent 1185.0 1817.0 ...
4 https://www.redfin.com/AZ/Tempe/Rio-Paradiso-A... redfin for_rent 1470.0 2235.0 ...
[5 rows x 41 columns]
```
- `zillow` ### Parameters for `scrape_properties()`
- `redfin` ```plaintext
- `realtor.com` Required
├── location (str): address in various formats e.g. just zip, full address, city/state, etc.
└── listing_type (enum): for_rent, for_sale, sold
Optional
├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
```
### Listing Types ### Property Schema
```plaintext
Property
├── Basic Information:
│ ├── property_url (str)
│ ├── site_name (enum): zillow, redfin, realtor.com
│ ├── listing_type (enum: ListingType)
│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building
├── Address Details:
│ ├── street_address (str)
│ ├── city (str)
│ ├── state (str)
│ ├── zip_code (str)
│ ├── unit (str)
│ └── country (str)
├── Property Features:
│ ├── price (int)
│ ├── tax_assessed_value (int)
│ ├── currency (str)
│ ├── square_feet (int)
│ ├── beds (int)
│ ├── baths (float)
│ ├── lot_area_value (float)
│ ├── lot_area_unit (str)
│ ├── stories (int)
│ └── year_built (int)
├── Miscellaneous Details:
│ ├── price_per_sqft (int)
│ ├── mls_id (str)
│ ├── agent_name (str)
│ ├── img_src (str)
│ ├── description (str)
│ ├── status_text (str)
│ ├── latitude (float)
│ ├── longitude (float)
│ └── posted_time (str) [Only for Zillow]
├── Building Details (for property_type: building):
│ ├── bldg_name (str)
│ ├── bldg_unit_count (int)
│ ├── bldg_min_beds (int)
│ ├── bldg_min_baths (float)
│ └── bldg_min_area (int)
└── Apartment Details (for property type: apartment):
├── apt_min_beds: int
├── apt_max_beds: int
├── apt_min_baths: float
├── apt_max_baths: float
├── apt_min_price: int
├── apt_max_price: int
├── apt_min_sqft: int
├── apt_max_sqft: int
```
## Supported Countries for Property Scraping
* **Zillow**: contains listings in the **US** & **Canada**
* **Realtor.com**: mainly from the **US** but also has international listings
* **Redfin**: listings mainly in the **US**, **Canada**, & has expanded to some areas in **Mexico**
### Exceptions
The following exceptions may be raised when using HomeHarvest:
- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com`
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `NoResultsFound` - no properties found from your input
- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input
## Frequently Asked Questions
---
**Q: Encountering issues with your queries?**
**A:** Try a single site and/or broaden the location. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues).
---
**Q: Received a Forbidden 403 response code?**
**A:** This indicates that you have been blocked by the real estate site for sending too many requests. Currently, **Zillow** is particularly aggressive with blocking. We recommend:
- Waiting a few seconds between requests.
- Trying a VPN to change your IP address.
---
- `for_rent`
- `for_sale`
- `sold`

View File

@@ -1,11 +1,14 @@
import pandas as pd
from typing import Union
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building, SiteName from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd
_scrapers = { _scrapers = {
@@ -15,7 +18,7 @@ _scrapers = {
} }
def validate_input(site_name: str, listing_type: str) -> None: def _validate_input(site_name: str, listing_type: str) -> None:
if site_name.lower() not in _scrapers: if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.") raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
@@ -25,93 +28,161 @@ def validate_input(site_name: str, listing_type: str) -> None:
) )
def get_ordered_properties(result: Union[Building, Property]) -> list[str]: def _get_ordered_properties(result: Property) -> list[str]:
if isinstance(result, Property):
return [ return [
"property_url",
"site_name",
"listing_type", "listing_type",
"address_one", "property_type",
"status_text",
"currency",
"price",
"apt_min_price",
"apt_max_price",
"apt_min_sqft",
"apt_max_sqft",
"apt_min_beds",
"apt_max_beds",
"apt_min_baths",
"apt_max_baths",
"tax_assessed_value",
"square_feet",
"price_per_sqft",
"beds",
"baths",
"lot_area_value",
"lot_area_unit",
"street_address",
"unit",
"city", "city",
"state", "state",
"zip_code", "zip_code",
"address_two", "country",
"url", "posted_time",
"property_type", "bldg_min_beds",
"price", "bldg_min_baths",
"beds", "bldg_min_area",
"baths", "bldg_unit_count",
"square_feet", "bldg_name",
"price_per_square_foot",
"lot_size",
"stories", "stories",
"year_built", "year_built",
"agent_name", "agent_name",
"mls_id", "mls_id",
"img_src",
"latitude",
"longitude",
"description", "description",
] ]
elif isinstance(result, Building):
return [
"address_one",
"city",
"state",
"zip_code",
"address_two",
"url",
"num_units",
"min_unit_price",
"max_unit_price",
"avg_unit_price",
"listing_type",
]
return []
def process_result(result: Union[Building, Property]) -> pd.DataFrame: def _process_result(result: Property) -> pd.DataFrame:
prop_data = result.__dict__ prop_data = result.__dict__
prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower()
else:
prop_data["property_type"] = None
if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["site_name"] = prop_data["site_name"] prop_data["street_address"] = address_data.street_address
prop_data["listing_type"] = prop_data["listing_type"].value prop_data["unit"] = address_data.unit
prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data.get("property_type") else None
prop_data["address_one"] = address_data.address_one
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
prop_data["state"] = address_data.state prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code prop_data["zip_code"] = address_data.zip_code
prop_data["address_two"] = address_data.address_two prop_data["country"] = address_data.country
del prop_data["address"] del prop_data["address"]
properties_df = pd.DataFrame([prop_data]) properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[get_ordered_properties(result)] properties_df = properties_df[_get_ordered_properties(result)]
return properties_df return properties_df
def _scrape_single_site(
location: str, site_name: str, listing_type: str, proxy: str = None
) -> pd.DataFrame:
"""
Helper function to scrape a single site.
"""
_validate_input(site_name, listing_type)
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy,
)
site = _scrapers[site_name.lower()](scraper_input)
results = site.search()
properties_dfs = [_process_result(result) for result in results]
properties_dfs = [
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
]
if not properties_dfs:
return pd.DataFrame()
return pd.concat(properties_dfs, ignore_index=True)
def scrape_property( def scrape_property(
location: str, location: str,
site_name: str, site_name: Union[str, list[str]] = None,
listing_type: str = "for_sale", #: for_sale, for_rent, sold listing_type: str = "for_sale",
proxy: str = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape property from various sites from a given location and listing type. Scrape property from various sites from a given location and listing type.
:returns: pd.DataFrame :returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way') :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin') :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold') :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties :return: pd.DataFrame containing properties
""" """
if site_name is None:
site_name = list(_scrapers.keys())
validate_input(site_name, listing_type) if not isinstance(site_name, list):
site_name = [site_name]
scraper_input = ScraperInput( results = []
location=location,
listing_type=ListingType[listing_type.upper()], if len(site_name) == 1:
site_name=site_name.lower(), final_df = _scrape_single_site(location, site_name[0], listing_type, proxy)
results.append(final_df)
else:
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(
_scrape_single_site, location, s_name, listing_type, proxy
): s_name
for s_name in site_name
}
for future in concurrent.futures.as_completed(futures):
result = future.result()
results.append(result)
results = [df for df in results if not df.empty and not df.isna().all().all()]
if not results:
return pd.DataFrame()
final_df = pd.concat(results, ignore_index=True)
columns_to_track = ["street_address", "city", "unit"]
#: validate they exist, otherwise create them
for col in columns_to_track:
if col not in final_df.columns:
final_df[col] = None
final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
) )
return final_df
site = _scrapers[site_name.lower()](scraper_input)
results = site.search()
properties_dfs = [process_result(result) for result in results]
return pd.concat(properties_dfs, ignore_index=True)

72
homeharvest/cli.py Normal file
View File

@@ -0,0 +1,72 @@
import argparse
import datetime
from homeharvest import scrape_property
def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument(
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)
parser.add_argument(
"-s",
"--site_name",
type=str,
nargs="*",
default=None,
help="Site name(s) to scrape from (e.g., realtor, zillow)",
)
parser.add_argument(
"-l",
"--listing_type",
type=str,
default="for_sale",
choices=["for_sale", "for_rent", "sold"],
help="Listing type to scrape",
)
parser.add_argument(
"-o",
"--output",
type=str,
default="excel",
choices=["excel", "csv"],
help="Output format",
)
parser.add_argument(
"-f",
"--filename",
type=str,
default=None,
help="Name of the output file (without extension)",
)
parser.add_argument(
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)
args = parser.parse_args()
result = scrape_property(
args.location, args.site_name, args.listing_type, proxy=args.proxy
)
if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
args.filename = f"HomeHarvest_{timestamp}"
if args.output == "excel":
output_filename = f"{args.filename}.xlsx"
result.to_excel(output_filename, index=False)
print(f"Excel file saved as {output_filename}")
elif args.output == "csv":
output_filename = f"{args.filename}.csv"
result.to_csv(output_filename, index=False)
print(f"CSV file saved as {output_filename}")
if __name__ == "__main__":
main()

View File

@@ -7,8 +7,8 @@ from .models import Property, ListingType, SiteName
class ScraperInput: class ScraperInput:
location: str location: str
listing_type: ListingType listing_type: ListingType
site_name: str site_name: SiteName
proxy_url: str | None = None proxy: str | None = None
class Scraper: class Scraper:
@@ -17,15 +17,16 @@ class Scraper:
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.session = requests.Session() self.session = requests.Session()
if scraper_input.proxy:
proxy_url = scraper_input.proxy
proxies = {
"http": proxy_url,
"https": proxy_url
}
self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name
if scraper_input.proxy_url:
self.session.proxies = {
"http": scraper_input.proxy_url,
"https": scraper_input.proxy_url,
}
def search(self) -> list[Property]: def search(self) -> list[Property]:
... ...

View File

@@ -7,24 +7,37 @@ class SiteName(Enum):
REDFIN = "redfin" REDFIN = "redfin"
REALTOR = "realtor.com" REALTOR = "realtor.com"
@classmethod
def get_by_value(cls, value):
for item in cls:
if item.value == value:
return item
raise ValueError(f"{value} not found in {cls}")
class ListingType(Enum): class ListingType(Enum):
FOR_SALE = "for_sale" FOR_SALE = "FOR_SALE"
FOR_RENT = "for_rent" FOR_RENT = "FOR_RENT"
SOLD = "sold" SOLD = "SOLD"
class PropertyType(Enum): class PropertyType(Enum):
HOUSE = "HOUSE" HOUSE = "HOUSE"
BUILDING = "BUILDING"
CONDO = "CONDO" CONDO = "CONDO"
TOWNHOUSE = "TOWNHOUSE" TOWNHOUSE = "TOWNHOUSE"
SINGLE_FAMILY = "SINGLE_FAMILY" SINGLE_FAMILY = "SINGLE_FAMILY"
MULTI_FAMILY = "MULTI_FAMILY" MULTI_FAMILY = "MULTI_FAMILY"
MANUFACTURED = "MANUFACTURED" MANUFACTURED = "MANUFACTURED"
NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
APARTMENT = "APARTMENT" APARTMENT = "APARTMENT"
APARTMENTS = "APARTMENTS"
LAND = "LAND" LAND = "LAND"
LOT = "LOT"
OTHER = "OTHER" OTHER = "OTHER"
BLANK = "BLANK"
@classmethod @classmethod
def from_int_code(cls, code): def from_int_code(cls, code):
mapping = { mapping = {
@@ -38,47 +51,62 @@ class PropertyType(Enum):
13: cls.SINGLE_FAMILY, 13: cls.SINGLE_FAMILY,
} }
return mapping.get(code, cls.OTHER) return mapping.get(code, cls.BLANK)
@dataclass @dataclass
class Address: class Address:
address_one: str street_address: str
city: str city: str
state: str state: str
zip_code: str zip_code: str
unit: str | None = None
address_two: str | None = None country: str | None = None
@dataclass()
class Realty:
site_name: str
address: Address
url: str
listing_type: ListingType | None = None
@dataclass @dataclass
class Property(Realty): class Property:
property_url: str
site_name: SiteName
listing_type: ListingType
address: Address
property_type: PropertyType | None = None
# house for sale
price: int | None = None price: int | None = None
tax_assessed_value: int | None = None
currency: str | None = None
square_feet: int | None = None
beds: int | None = None beds: int | None = None
baths: float | None = None baths: float | None = None
lot_area_value: float | None = None
lot_area_unit: str | None = None
stories: int | None = None stories: int | None = None
year_built: int | None = None year_built: int | None = None
square_feet: int | None = None price_per_sqft: int | None = None
price_per_square_foot: int | None = None
mls_id: str | None = None mls_id: str | None = None
agent_name: str | None = None agent_name: str | None = None
property_type: PropertyType | None = None img_src: str | None = None
lot_size: int | None = None
description: str | None = None description: str | None = None
status_text: str | None = None
latitude: float | None = None
longitude: float | None = None
posted_time: str | None = None
# building for sale
bldg_name: str | None = None
bldg_unit_count: int | None = None
bldg_min_beds: int | None = None
bldg_min_baths: float | None = None
bldg_min_area: int | None = None
@dataclass # apt
class Building(Realty): apt_min_beds: int | None = None
num_units: int | None = None apt_max_beds: int | None = None
min_unit_price: int | None = None apt_min_baths: float | None = None
max_unit_price: int | None = None apt_max_baths: float | None = None
avg_unit_price: int | None = None apt_min_price: int | None = None
apt_max_price: int | None = None
apt_min_sqft: int | None = None
apt_max_sqft: int | None = None

View File

@@ -3,6 +3,7 @@ from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two, parse_unit
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -29,7 +30,7 @@ class RealtorScraper(Scraper):
params = { params = {
"input": self.location, "input": self.location,
"client_id": self.listing_type.value.replace('_', '-'), "client_id": self.listing_type.value.lower().replace("_", "-"),
"limit": "1", "limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
@@ -43,7 +44,7 @@ class RealtorScraper(Scraper):
result = response_json["autocomplete"] result = response_json["autocomplete"]
if result is None: if not result:
raise NoResultsFound("No results found for location: " + self.location) raise NoResultsFound("No results found for location: " + self.location)
return result[0] return result[0]
@@ -96,46 +97,56 @@ class RealtorScraper(Scraper):
} }
}""" }"""
variables = { variables = {"property_id": property_id}
'property_id': property_id
}
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response_json = response.json() response_json = response.json()
property_info = response_json['data']['property'] property_info = response_json["data"]["property"]
street_address, unit = parse_address_two(property_info["address"]["line"])
return [Property( return [
Property(
site_name=self.site_name, site_name=self.site_name,
address=Address( address=Address(
address_one=property_info['address']['line'], street_address=street_address,
city=property_info['address']['city'], city=property_info["address"]["city"],
state=property_info['address']['state_code'], state=property_info["address"]["state_code"],
zip_code=property_info['address']['postal_code'], zip_code=property_info["address"]["postal_code"],
unit=unit,
country="USA",
), ),
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], property_url="https://www.realtor.com/realestateandhomes-detail/"
beds=property_info['basic']['beds'], + property_info["details"]["permalink"],
baths=property_info['basic']['baths'], beds=property_info["basic"]["beds"],
stories=property_info['details']['stories'], baths=property_info["basic"]["baths"],
year_built=property_info['details']['year_built'], stories=property_info["details"]["stories"],
square_feet=property_info['basic']['sqft'], year_built=property_info["details"]["year_built"],
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] square_feet=property_info["basic"]["sqft"],
if property_info['basic']['sqft'] is not None and price_per_sqft=property_info["basic"]["price"]
property_info['basic']['price'] is not None // property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None
and property_info["basic"]["price"] is not None
else None, else None,
price=property_info['basic']['price'], price=property_info["basic"]["price"],
mls_id=property_id, mls_id=property_id,
listing_type=self.listing_type, listing_type=self.listing_type,
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, lot_area_value=property_info["public_record"]["lot_size"]
)] if property_info["public_record"] is not None
else None,
)
]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: def handle_area(
query = """query Home_search( self, variables: dict, return_total: bool = False
) -> list[Property] | int:
query = (
"""query Home_search(
$city: String, $city: String,
$county: [String], $county: [String],
$state_code: String, $state_code: String,
@@ -184,6 +195,10 @@ class RealtorScraper(Scraper):
street_post_direction street_post_direction
street_suffix street_suffix
unit unit
coordinate {
lon
lat
}
} }
} }
list_price list_price
@@ -193,42 +208,74 @@ class RealtorScraper(Scraper):
} }
} }
} }
}""" % self.listing_type.value }"""
% self.listing_type.value.lower()
)
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response.raise_for_status()
response_json = response.json() response_json = response.json()
if return_total: if return_total:
return response_json['data']['home_search']['total'] return response_json["data"]["home_search"]["total"]
properties: list[Property] = [] properties: list[Property] = []
for result in response_json['data']['home_search']['results']: if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "home_search" not in response_json["data"]
or response_json["data"]["home_search"] is None
or "results" not in response_json["data"]["home_search"]
):
return []
for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
realty_property = Property( realty_property = Property(
address=Address( address=Address(
address_one=result['location']['address']['line'], street_address=street_address,
city=result['location']['address']['city'], city=result["location"]["address"]["city"],
state=result['location']['address']['state_code'], state=result["location"]["address"]["state_code"],
zip_code=result['location']['address']['postal_code'], zip_code=result["location"]["address"]["postal_code"],
address_two=result['location']['address']['unit'], unit=parse_unit(result["location"]["address"]["unit"]),
country="USA",
), ),
latitude=result["location"]["address"]["coordinate"]["lat"]
if result
and result.get("location")
and result["location"].get("address")
and result["location"]["address"].get("coordinate")
and "lat" in result["location"]["address"]["coordinate"]
else None,
longitude=result["location"]["address"]["coordinate"]["lon"]
if result
and result.get("location")
and result["location"].get("address")
and result["location"]["address"].get("coordinate")
and "lon" in result["location"]["address"]["coordinate"]
else None,
site_name=self.site_name, site_name=self.site_name,
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], property_url="https://www.realtor.com/realestateandhomes-detail/"
beds=result['description']['beds'], + result["property_id"],
baths=result['description']['baths'], beds=result["description"]["beds"],
stories=result['description']['stories'], baths=result["description"]["baths"],
year_built=result['description']['year_built'], stories=result["description"]["stories"],
square_feet=result['description']['sqft'], year_built=result["description"]["year_built"],
price_per_square_foot=result['price_per_sqft'], square_feet=result["description"]["sqft"],
price=result['list_price'], price_per_sqft=result["price_per_sqft"],
mls_id=result['property_id'], price=result["list_price"],
mls_id=result["property_id"],
listing_type=self.listing_type, listing_type=self.listing_type,
lot_size=result['description']['lot_sqft'], lot_area_value=result["description"]["lot_sqft"],
) )
properties.append(realty_property) properties.append(realty_property)
@@ -239,17 +286,17 @@ class RealtorScraper(Scraper):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
if location_type == 'address': if location_type == "address":
property_id = location_info['mpr_id'] property_id = location_info["mpr_id"]
return self.handle_address(property_id) return self.handle_address(property_id)
offset = 0 offset = 0
search_variables = { search_variables = {
'city': location_info.get('city'), "city": location_info.get("city"),
'county': location_info.get('county'), "county": location_info.get("county"),
'state_code': location_info.get('state_code'), "state_code": location_info.get("state_code"),
'postal_code': location_info.get('postal_code'), "postal_code": location_info.get("postal_code"),
'offset': offset, "offset": offset,
} }
total = self.handle_area(search_variables, return_total=True) total = self.handle_area(search_variables, return_total=True)
@@ -258,8 +305,11 @@ class RealtorScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures = [ futures = [
executor.submit( executor.submit(
self.handle_area, variables=search_variables | {'offset': i}, return_total=False self.handle_area,
) for i in range(0, total, 200) variables=search_variables | {"offset": i},
return_total=False,
)
for i in range(0, total, 200)
] ]
for future in as_completed(futures): for future in as_completed(futures):

View File

@@ -1,7 +1,9 @@
import json import json
from ..models import Property, Address, PropertyType, Building
from .. import Scraper
from typing import Any from typing import Any
from .. import Scraper
from ....utils import parse_address_two, parse_unit
from ..models import Property, Address, PropertyType, ListingType, SiteName
from ....exceptions import NoResultsFound
class RedfinScraper(Scraper): class RedfinScraper(Scraper):
@@ -25,6 +27,11 @@ class RedfinScraper(Scraper):
elif match_type == "1": elif match_type == "1":
return "address" #: address, needs to be handled differently return "address" #: address, needs to be handled differently
if "exactMatch" not in response_json["payload"]:
raise NoResultsFound(
"No results found for location: {}".format(self.location)
)
if response_json["payload"]["exactMatch"] is not None: if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"] target = response_json["payload"]["exactMatch"]
else: else:
@@ -38,24 +45,33 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address( address = Address(
address_one=get_value("streetLine"), street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
else: else:
address_info = home["streetAddress"] address_info = home["streetAddress"]
street_address, unit = parse_address_two(address_info["assembledAddress"])
address = Address( address = Address(
address_one=address_info["assembledAddress"], street_address=street_address,
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
url = "https://www.redfin.com{}".format(home["url"]) url = "https://www.redfin.com{}".format(home["url"])
property_type = home["propertyType"] if "propertyType" in home else None #: property_type = home["propertyType"] if "propertyType" in home else None
lot_size_data = home.get("lotSize") lot_size_data = home.get("lotSize")
if not isinstance(lot_size_data, int): if not isinstance(lot_size_data, int):
lot_size = ( lot_size = (
lot_size_data.get("value", None) lot_size_data.get("value", None)
@@ -69,7 +85,7 @@ class RedfinScraper(Scraper):
site_name=self.site_name, site_name=self.site_name,
listing_type=self.listing_type, listing_type=self.listing_type,
address=address, address=address,
url=url, property_url=url,
beds=home["beds"] if "beds" in home else None, beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None, baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None, stories=home["stories"] if "stories" in home else None,
@@ -79,40 +95,108 @@ class RedfinScraper(Scraper):
if not single_search if not single_search
else home["yearBuilt"], else home["yearBuilt"],
square_feet=get_value("sqFt"), square_feet=get_value("sqFt"),
lot_size=lot_size, lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")), property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_square_foot=get_value("pricePerSqFt"), price_per_sqft=get_value("pricePerSqFt"),
price=get_value("price"), price=get_value("price"),
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
latitude=home["latLong"]["latitude"]
if "latLong" in home and "latitude" in home["latLong"]
else None,
longitude=home["latLong"]["longitude"]
if "latLong" in home and "longitude" in home["latLong"]
else None,
) )
def _parse_building(self, building: dict) -> Building: def _handle_rentals(self, region_id, region_type):
return Building( url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
response = self.session.get(url)
response.raise_for_status()
homes = response.json()
properties_list = []
for home in homes["homes"]:
home_data = home["homeData"]
rental_data = home["rentalExtension"]
property_url = f"https://www.redfin.com{home_data.get('url', '')}"
address_info = home_data.get("addressInfo", {})
centroid = address_info.get("centroid", {}).get("centroid", {})
address = Address( address = Address(
address_one=" ".join( street_address=address_info.get("formattedStreetLine", None),
[ city=address_info.get("city", None),
building['address']['streetNumber'], state=address_info.get("state", None),
building['address']['directionalPrefix'], zip_code=address_info.get("zip", None),
building['address']['streetName'], unit=None,
building['address']['streetType'], country="US" if address_info.get("countryCode", None) == 1 else None,
]
),
city=building['address']['city'],
state=building['address']['stateOrProvinceCode'],
zip_code=building['address']['postalCode'],
address_two=" ".join(
[
building['address']['unitType'],
building['address']['unitValue'],
]
)
),
site_name=self.site_name,
url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type,
num_units=building["numUnitsForSale"],
) )
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
bed_range = rental_data.get("bedRange", {"min": None, "max": None})
bath_range = rental_data.get("bathRange", {"min": None, "max": None})
sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})
property_ = Property(
property_url=property_url,
site_name=SiteName.REDFIN,
listing_type=ListingType.FOR_RENT,
address=address,
apt_min_beds=bed_range.get("min", None),
apt_min_baths=bath_range.get("min", None),
apt_max_beds=bed_range.get("max", None),
apt_max_baths=bath_range.get("max", None),
description=rental_data.get("description", None),
latitude=centroid.get("latitude", None),
longitude=centroid.get("longitude", None),
apt_min_price=price_range.get("min", None),
apt_max_price=price_range.get("max", None),
apt_min_sqft=sqft_range.get("min", None),
apt_max_sqft=sqft_range.get("max", None),
img_src=home_data.get("staticMapUrl", None),
posted_time=rental_data.get("lastUpdated", None),
bldg_name=rental_data.get("propertyName", None),
)
properties_list.append(property_)
if not properties_list:
raise NoResultsFound("No rentals found for the given location.")
return properties_list
def _parse_building(self, building: dict) -> Property:
street_address = " ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
)
street_address, unit = parse_address_two(street_address)
return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
street_address=street_address,
city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"],
unit=parse_unit(
" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
)
),
),
property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type,
bldg_unit_count=building["numUnitsForSale"],
)
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
""" """
@@ -142,17 +226,19 @@ class RedfinScraper(Scraper):
home_id = region_id home_id = region_id
return self.handle_address(home_id) return self.handle_address(home_id)
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format( if self.listing_type == ListingType.FOR_RENT:
region_id, region_type return self._handle_rentals(region_id, region_type)
) else:
if self.listing_type == ListingType.FOR_SALE:
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
else:
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", "")) response_json = json.loads(response.text.replace("{}&&", ""))
homes = [ homes = [
self._parse_home(home) for home in response_json["payload"]["homes"] self._parse_home(home) for home in response_json["payload"]["homes"]
] + [ ] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values() self._parse_building(building)
for building in response_json["payload"]["buildings"].values()
] ]
return homes return homes

View File

@@ -1,21 +1,39 @@
import re import re
import json import json
from ..models import Property, Address, Building, ListingType, PropertyType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two, parse_unit
from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, ListingType, PropertyType
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
self.listing_type = scraper_input.listing_type
if not self.is_plausible_location(self.location):
raise NoResultsFound("Invalid location input: {}".format(self.location))
if self.listing_type == ListingType.FOR_SALE: if self.listing_type == ListingType.FOR_SALE:
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
elif self.listing_type == ListingType.FOR_RENT: elif self.listing_type == ListingType.FOR_RENT:
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
else:
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
def is_plausible_location(self, location: str) -> bool:
url = (
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
).format(location)
response = self.session.get(url)
return response.json()["results"] != []
def search(self): def search(self):
resp = self.session.get(self.url, headers=self._get_headers()) resp = self.session.get(
self.url, headers=self._get_headers()
)
resp.raise_for_status() resp.raise_for_status()
content = resp.text content = resp.text
@@ -33,10 +51,17 @@ class ZillowScraper(Scraper):
data = json.loads(json_str) data = json.loads(json_str)
if "searchPageState" in data["props"]["pageProps"]: if "searchPageState" in data["props"]["pageProps"]:
houses = data["props"]["pageProps"]["searchPageState"]["cat1"][ pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
"searchResults"
]["listResults"] match = re.search(pattern, content)
return [self._parse_home(house) for house in houses]
if match:
coords = [float(coord) for coord in match.groups()]
return self._fetch_properties_backend(coords)
else:
raise GeoCoordsNotFound("Box bounds could not be located.")
elif "gdpClientCache" in data["props"]["pageProps"]: elif "gdpClientCache" in data["props"]["pageProps"]:
gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"]) gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"])
main_key = list(gdp_client_cache.keys())[0] main_key = list(gdp_client_cache.keys())[0]
@@ -45,47 +70,166 @@ class ZillowScraper(Scraper):
property = self._get_single_property_page(property_data) property = self._get_single_property_page(property_data)
return [property] return [property]
raise PropertyNotFound("Specific property data not found in the response.") raise NoResultsFound("Specific property data not found in the response.")
def _parse_home(self, home: dict): def _fetch_properties_backend(self, coords):
""" url = "https://www.zillow.com/async-create-search-page-state"
This method is used when a user enters a generic location & zillow returns more than one property
""" filter_state_for_sale = {
url = ( "sortSelection": {
f"https://www.zillow.com{home['detailUrl']}" # "value": "globalrelevanceex"
if "zillow.com" not in home["detailUrl"] "value": "days"
else home["detailUrl"] },
"isAllHomes": {"value": True},
}
filter_state_for_rent = {
"isForRent": {"value": True},
"isForSaleByAgent": {"value": False},
"isForSaleByOwner": {"value": False},
"isNewConstruction": {"value": False},
"isComingSoon": {"value": False},
"isAuction": {"value": False},
"isForSaleForeclosure": {"value": False},
"isAllHomes": {"value": True},
}
filter_state_sold = {
"isRecentlySold": {"value": True},
"isForSaleByAgent": {"value": False},
"isForSaleByOwner": {"value": False},
"isNewConstruction": {"value": False},
"isComingSoon": {"value": False},
"isAuction": {"value": False},
"isForSaleForeclosure": {"value": False},
"isAllHomes": {"value": True},
}
selected_filter = (
filter_state_for_rent
if self.listing_type == ListingType.FOR_RENT
else filter_state_for_sale
if self.listing_type == ListingType.FOR_SALE
else filter_state_sold
) )
if "hdpData" in home and "homeInfo" in home["hdpData"]: payload = {
price_data = self._extract_price(home) "searchQueryState": {
address = self._extract_address(home) "pagination": {},
agent_name = self._extract_agent_name(home) "isMapVisible": True,
beds = home["hdpData"]["homeInfo"]["bedrooms"] "mapBounds": {
baths = home["hdpData"]["homeInfo"]["bathrooms"] "west": coords[0],
property_type = home["hdpData"]["homeInfo"].get("homeType") "east": coords[1],
"south": coords[2],
return Property( "north": coords[3],
site_name=self.site_name, },
address=address, "filterState": selected_filter,
agent_name=agent_name, "isListVisible": True,
url=url, "mapZoom": 11,
beds=beds, },
baths=baths, "wants": {"cat1": ["mapResults"]},
listing_type=self.listing_type, "isDebugRequest": False,
property_type=PropertyType(property_type), }
**price_data, resp = self.session.put(
url, headers=self._get_headers(), json=payload
) )
else: resp.raise_for_status()
keys = ("addressStreet", "addressCity", "addressState", "addressZipcode") a = resp.json()
address_one, city, state, zip_code = (home[key] for key in keys) return self._parse_properties(resp.json())
address_one, address_two = self._parse_address_two(address_one)
address = Address(address_one, city, state, zip_code, address_two)
building_info = self._extract_building_info(home) def _parse_properties(self, property_data: dict):
return Building( mapresults = property_data["cat1"]["searchResults"]["mapResults"]
site_name=self.site_name, address=address, url=url, **building_info
properties_list = []
for result in mapresults:
if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"]
address_data = {
"street_address": parse_address_two(home_info["streetAddress"])[0],
"unit": parse_unit(home_info["unit"])
if "unit" in home_info
else None,
"city": home_info["city"],
"state": home_info["state"],
"zip_code": home_info["zipcode"],
"country": home_info["country"],
}
property_data = {
"site_name": self.site_name,
"address": Address(**address_data),
"property_url": f"https://www.zillow.com{result['detailUrl']}",
"beds": int(home_info["bedrooms"])
if "bedrooms" in home_info
else None,
"baths": home_info.get("bathrooms"),
"square_feet": int(home_info["livingArea"])
if "livingArea" in home_info
else None,
"currency": home_info["currency"],
"price": home_info.get("price"),
"tax_assessed_value": int(home_info["taxAssessedValue"])
if "taxAssessedValue" in home_info
else None,
"property_type": PropertyType(home_info["homeType"]),
"listing_type": ListingType(
home_info["statusType"]
if "statusType" in home_info
else self.listing_type
),
"lot_area_value": round(home_info["lotAreaValue"], 2)
if "lotAreaValue" in home_info
else None,
"lot_area_unit": home_info.get("lotAreaUnit"),
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
"status_text": result.get("statusText"),
"posted_time": result["variableData"]["text"]
if "variableData" in result
and "text" in result["variableData"]
and result["variableData"]["type"] == "TIME_ON_INFO"
else None,
"img_src": result.get("imgSrc"),
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
if "livingArea" in home_info
and home_info["livingArea"] != 0
and "price" in home_info
else None,
}
property_obj = Property(**property_data)
properties_list.append(property_obj)
elif "isBuilding" in result:
price = result["price"]
building_data = {
"property_url": f"https://www.zillow.com{result['detailUrl']}",
"site_name": self.site_name,
"property_type": PropertyType("BUILDING"),
"listing_type": ListingType(result["statusType"]),
"img_src": result["imgSrc"],
"price": int(price.replace("From $", "").replace(",", ""))
if "From $" in price
else None,
"apt_min_price": int(
price.replace("$", "").replace(",", "").replace("+/mo", "")
) )
if "+/mo" in price
else None,
"address": self._extract_address(result["address"]),
"bldg_min_beds": result["minBeds"],
"currency": "USD",
"bldg_min_baths": result["minBaths"],
"bldg_min_area": result.get("minArea"),
"bldg_unit_count": result["unitCount"],
"bldg_name": result.get("communityName"),
"status_text": result["statusText"],
"latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"],
}
building_obj = Property(**building_data)
properties_list.append(building_obj)
return properties_list
def _get_single_property_page(self, property_data: dict): def _get_single_property_page(self, property_data: dict):
""" """
@@ -97,32 +241,38 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
address_one, address_two = self._parse_address_two( street_address, unit = parse_address_two(address_data["streetAddress"])
address_data["streetAddress"]
)
address = Address( address = Address(
address_one=address_one, street_address=street_address,
address_two=address_two, unit=unit,
city=address_data["city"], city=address_data["city"],
state=address_data["state"], state=address_data["state"],
zip_code=address_data["zipcode"], zip_code=address_data["zipcode"],
country=property_data.get("country"),
) )
property_type = property_data.get("homeType", None) property_type = property_data.get("homeType", None)
return Property( return Property(
site_name=self.site_name, site_name=self.site_name,
address=address, address=address,
url=url, property_url=url,
beds=property_data.get("bedrooms", None), beds=property_data.get("bedrooms", None),
baths=property_data.get("bathrooms", None), baths=property_data.get("bathrooms", None),
year_built=property_data.get("yearBuilt", None), year_built=property_data.get("yearBuilt", None),
price=property_data.get("price", None), price=property_data.get("price", None),
lot_size=property_data.get("lotSize", None), tax_assessed_value=property_data.get("taxAssessedValue", None),
latitude=property_data.get("latitude"),
longitude=property_data.get("longitude"),
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
currency=property_data.get("currency", None),
lot_area_value=property_data.get("lotAreaValue"),
lot_area_unit=property_data["lotAreaUnits"].lower()
if "lotAreaUnits" in property_data
else None,
agent_name=property_data.get("attributionInfo", {}).get("agentName", None), agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
stories=property_data.get("resoFacts", {}).get("stories", None), stories=property_data.get("resoFacts", {}).get("stories", None),
description=property_data.get("description", None), description=property_data.get("description", None),
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None), mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
price_per_square_foot=property_data.get("resoFacts", {}).get( price_per_sqft=property_data.get("resoFacts", {}).get(
"pricePerSquareFoot", None "pricePerSquareFoot", None
), ),
square_feet=property_data.get("livingArea", None), square_feet=property_data.get("livingArea", None),
@@ -130,81 +280,54 @@ class ZillowScraper(Scraper):
listing_type=self.listing_type, listing_type=self.listing_type,
) )
def _extract_building_info(self, home: dict) -> dict: def _extract_address(self, address_str):
num_units = len(home["units"]) """
prices = [ Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) and return an Address object.
for unit in home["units"] """
] parts = address_str.split(", ")
return {
"listing_type": self.listing_type, if len(parts) != 3:
"num_units": len(home["units"]), raise ValueError(f"Unexpected address format: {address_str}")
"min_unit_price": min(
( street_address = parts[0].strip()
int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) city = parts[1].strip()
for unit in home["units"] state_zip = parts[2].split(" ")
if len(state_zip) == 1:
state = state_zip[0].strip()
zip_code = None
elif len(state_zip) == 2:
state = state_zip[0].strip()
zip_code = state_zip[1].strip()
else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
street_address, unit = parse_address_two(street_address)
return Address(
street_address=street_address,
city=city,
unit=unit,
state=state,
zip_code=zip_code,
country="USA",
) )
),
"max_unit_price": max(
(
int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
for unit in home["units"]
)
),
"avg_unit_price": sum(prices) // len(prices) if num_units else None,
}
@staticmethod
def _extract_price(home: dict) -> dict:
price = int(home["hdpData"]["homeInfo"]["priceForHDP"])
square_feet = home["hdpData"]["homeInfo"].get("livingArea")
lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue")
price_per_square_foot = price // square_feet if square_feet and price else None
return {
k: v
for k, v in locals().items()
if k in ["price", "square_feet", "lot_size", "price_per_square_foot"]
}
@staticmethod
def _extract_agent_name(home: dict) -> str | None:
broker_str = home.get("brokerName", "")
match = re.search(r"Listing by: (.+)", broker_str)
return match.group(1) if match else None
@staticmethod
def _parse_address_two(address_one: str):
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
address_two = apt_match.group().strip() if apt_match else None
address_one = (
address_one.replace(address_two, "").strip() if address_two else address_one
)
return address_one, address_two
@staticmethod
def _extract_address(home: dict) -> Address:
keys = ("streetAddress", "city", "state", "zipcode")
address_one, city, state, zip_code = (
home["hdpData"]["homeInfo"][key] for key in keys
)
address_one, address_two = ZillowScraper._parse_address_two(address_one)
return Address(address_one, city, state, zip_code, address_two=address_two)
@staticmethod @staticmethod
def _get_headers(): def _get_headers():
return { return {
"authority": "parser-external.geo.moveaws.com", "authority": "www.zillow.com",
"accept": "*/*", "accept": "*/*",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"content-type": "application/json",
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
"origin": "https://www.zillow.com", "origin": "https://www.zillow.com",
"referer": "https://www.zillow.com/", "referer": "https://www.zillow.com",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0", "sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"', "sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty", "sec-fetch-dest": "empty",
"sec-fetch-mode": "cors", "sec-fetch-mode": "cors",
"sec-fetch-site": "cross-site", "sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
} }

View File

@@ -10,5 +10,5 @@ class NoResultsFound(Exception):
"""Raised when no results are found for the given location""" """Raised when no results are found for the given location"""
class PropertyNotFound(Exception): class GeoCoordsNotFound(Exception):
"""Raised when no property is found for the given address""" """Raised when no property is found for the given address"""

48
homeharvest/utils.py Normal file
View File

@@ -0,0 +1,48 @@
import re
def parse_address_two(street_address: str) -> tuple:
if not street_address:
return street_address, None
apt_match = re.search(
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
street_address,
re.I,
)
if apt_match:
apt_str = apt_match.group().strip()
cleaned_apt_str = re.sub(
r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I
)
main_address = street_address.replace(apt_str, "").strip()
return main_address, cleaned_apt_str
else:
return street_address, None
def parse_unit(street_address: str):
    """Extract and normalize the unit designator from a street address.

    Recognizes a trailing "APT", "UNIT", "LOT", "SUITE", or "#" designator
    (case-insensitive) and returns it normalized as ``"#<id>"``.
    SUITE is included so the designator set matches ``parse_address_two``.

    :param street_address: Raw street address, possibly ending with a unit.
    :return: The normalized unit string (e.g. ``"#126"``), or ``None`` when
        the input is falsy or has no trailing designator.
    """
    if not street_address:
        return None
    apt_match = re.search(
        r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
        street_address,
        re.I,
    )
    if apt_match:
        apt_str = apt_match.group().strip()
        # Normalize the designator word to a leading "#".
        return re.sub(
            r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I
        )
    return None
if __name__ == "__main__":
    # Manual smoke test: print the parsed (address, unit) pair for a few
    # representative inputs covering each designator spelling and casing.
    for sample in (
        "4303 E Cactus Rd Apt 126",
        "1234 Elm Street apt 2B",
        "1234 Elm Street UNIT 3A",
        "1234 Elm Street unit 3A",
        "1234 Elm Street SuIte 3A",
    ):
        print(parse_address_two(sample))

27
poetry.lock generated
View File

@@ -106,6 +106,17 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
] ]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
description = "An implementation of lxml.xmlfile for the standard library"
optional = false
python-versions = ">=3.6"
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.1.3" version = "1.1.3"
@@ -217,6 +228,20 @@ files = [
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, {file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
] ]
[[package]]
name = "openpyxl"
version = "3.1.2"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = false
python-versions = ">=3.6"
files = [
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "23.1" version = "23.1"
@@ -425,4 +450,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "eede625d6d45085e143b0af246cb2ce00cff8579c667be3b63387c8594a5570d" content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"

View File

@@ -1,15 +1,19 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.1.3" version = "0.2.5"
description = "Real estate scraping library" description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"
readme = "README.md" readme = "README.md"
[tool.poetry.scripts]
homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.0" pandas = "^2.1.0"
openpyxl = "^3.1.2"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@@ -1,12 +1,40 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import (
InvalidSite,
InvalidListingType,
NoResultsFound,
GeoCoordsNotFound,
)
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), scrape_property(
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format location="2530 Al Lipscomb Way",
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format site_name="realtor.com",
listing_type="for_sale",
),
scrape_property(
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
), #: does not support "city, state, USA" format
scrape_property(location="85281", site_name="realtor.com"), scrape_property(location="85281", site_name="realtor.com"),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])
bad_results = []
try:
bad_results += [
scrape_property(
location="abceefg ju098ot498hh9",
site_name="realtor.com",
listing_type="for_sale",
)
]
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
assert True
assert all([result is None for result in bad_results])

View File

@@ -1,12 +1,38 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import (
InvalidSite,
InvalidListingType,
NoResultsFound,
GeoCoordsNotFound,
)
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"), location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="redfin"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
),
scrape_property(location="85281", site_name="redfin"), scrape_property(location="85281", site_name="redfin"),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])
bad_results = []
try:
bad_results += [
scrape_property(
location="abceefg ju098ot498hh9",
site_name="redfin",
listing_type="for_sale",
)
]
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
assert True
assert all([result is None for result in bad_results])

View File

@@ -1,12 +1,38 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import (
InvalidSite,
InvalidListingType,
NoResultsFound,
GeoCoordsNotFound,
)
def test_zillow(): def test_zillow():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="zillow"), location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="zillow"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])
bad_results = []
try:
bad_results += [
scrape_property(
location="abceefg ju098ot498hh9",
site_name="zillow",
listing_type="for_sale",
)
]
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
assert True
assert all([result is None for result in bad_results])