mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b0e40df00a | ||
|
|
2fc40e0dad | ||
|
|
254f3a68a1 | ||
|
|
05713c76b0 | ||
|
|
9120cc9bfe | ||
|
|
eee4b19515 | ||
|
|
c25961eded | ||
|
|
0884c3d163 | ||
|
|
8f37bfdeb8 | ||
|
|
48c2338276 | ||
|
|
f58a1f4a74 | ||
|
|
4cef926d7d | ||
|
|
e82eeaa59f | ||
|
|
644f16b25b | ||
|
|
e9ddc6df92 | ||
|
|
50fb1c391d | ||
|
|
4f91f9dadb | ||
|
|
66e55173b1 | ||
|
|
f6054e8746 | ||
|
|
e8d9235ee6 | ||
|
|
043f091158 | ||
|
|
eae8108978 |
65
README.md
65
README.md
@@ -4,20 +4,26 @@
|
|||||||
|
|
||||||
[Try with Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
[Try with Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||||
|
|
||||||
|
\
|
||||||
|
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
||||||
|
|
||||||
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||||
|
|
||||||
|
Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** – a Python package for job scraping*
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
||||||
- Aggregates the properties in a Pandas DataFrame
|
- Aggregates the properties in a Pandas DataFrame
|
||||||
|
|
||||||
[Video Guide for HomeHarvest](https://www.youtube.com/watch?v=HCoHoiJdWQY)
|
[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install --force-reinstall homeharvest
|
pip install homeharvest
|
||||||
```
|
```
|
||||||
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
||||||
|
|
||||||
@@ -37,6 +43,7 @@ By default:
|
|||||||
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
|
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
|
||||||
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
|
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
|
||||||
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
|
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
|
||||||
|
- Use `-k` or `--keep_duplicates` to keep duplicate properties based on address. If not provided, duplicates will be removed.
|
||||||
### Python
|
### Python
|
||||||
|
|
||||||
```py
|
```py
|
||||||
@@ -71,8 +78,9 @@ Required
|
|||||||
├── location (str): address in various formats e.g. just zip, full address, city/state, etc.
|
├── location (str): address in various formats e.g. just zip, full address, city/state, etc.
|
||||||
└── listing_type (enum): for_rent, for_sale, sold
|
└── listing_type (enum): for_rent, for_sale, sold
|
||||||
Optional
|
Optional
|
||||||
├── site_name (List[enum], default=all three sites): zillow, realtor.com, redfin
|
├── site_name (list[enum], default=all three sites): zillow, realtor.com, redfin
|
||||||
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
|
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
|
||||||
|
└── keep_duplicates (bool, default=False): whether to keep or remove duplicate properties based on address
|
||||||
```
|
```
|
||||||
|
|
||||||
### Property Schema
|
### Property Schema
|
||||||
@@ -81,7 +89,7 @@ Property
|
|||||||
├── Basic Information:
|
├── Basic Information:
|
||||||
│ ├── property_url (str)
|
│ ├── property_url (str)
|
||||||
│ ├── site_name (enum): zillow, redfin, realtor.com
|
│ ├── site_name (enum): zillow, redfin, realtor.com
|
||||||
│ ├── listing_type (enum: ListingType)
|
│ ├── listing_type (enum): for_sale, for_rent, sold
|
||||||
│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building
|
│ └── property_type (enum): house, apartment, condo, townhouse, single_family, multi_family, building
|
||||||
|
|
||||||
├── Address Details:
|
├── Address Details:
|
||||||
@@ -92,45 +100,38 @@ Property
|
|||||||
│ ├── unit (str)
|
│ ├── unit (str)
|
||||||
│ └── country (str)
|
│ └── country (str)
|
||||||
|
|
||||||
├── Property Features:
|
├── House for Sale Features:
|
||||||
│ ├── price (int)
|
|
||||||
│ ├── tax_assessed_value (int)
|
│ ├── tax_assessed_value (int)
|
||||||
│ ├── currency (str)
|
|
||||||
│ ├── square_feet (int)
|
|
||||||
│ ├── beds (int)
|
|
||||||
│ ├── baths (float)
|
|
||||||
│ ├── lot_area_value (float)
|
│ ├── lot_area_value (float)
|
||||||
│ ├── lot_area_unit (str)
|
│ ├── lot_area_unit (str)
|
||||||
│ ├── stories (int)
|
│ ├── stories (int)
|
||||||
│ └── year_built (int)
|
│ ├── year_built (int)
|
||||||
|
│ └── price_per_sqft (int)
|
||||||
|
|
||||||
|
├── Building for Sale and Apartment Details:
|
||||||
|
│ ├── bldg_name (str)
|
||||||
|
│ ├── beds_min (int)
|
||||||
|
│ ├── beds_max (int)
|
||||||
|
│ ├── baths_min (float)
|
||||||
|
│ ├── baths_max (float)
|
||||||
|
│ ├── sqft_min (int)
|
||||||
|
│ ├── sqft_max (int)
|
||||||
|
│ ├── price_min (int)
|
||||||
|
│ ├── price_max (int)
|
||||||
|
│ ├── area_min (int)
|
||||||
|
│ └── unit_count (int)
|
||||||
|
|
||||||
├── Miscellaneous Details:
|
├── Miscellaneous Details:
|
||||||
│ ├── price_per_sqft (int)
|
|
||||||
│ ├── mls_id (str)
|
│ ├── mls_id (str)
|
||||||
│ ├── agent_name (str)
|
│ ├── agent_name (str)
|
||||||
│ ├── img_src (str)
|
│ ├── img_src (str)
|
||||||
│ ├── description (str)
|
│ ├── description (str)
|
||||||
│ ├── status_text (str)
|
│ ├── status_text (str)
|
||||||
│ ├── latitude (float)
|
│ └── posted_time (str)
|
||||||
│ ├── longitude (float)
|
|
||||||
│ └── posted_time (str) [Only for Zillow]
|
|
||||||
|
|
||||||
├── Building Details (for property_type: building):
|
└── Location Details:
|
||||||
│ ├── bldg_name (str)
|
├── latitude (float)
|
||||||
│ ├── bldg_unit_count (int)
|
└── longitude (float)
|
||||||
│ ├── bldg_min_beds (int)
|
|
||||||
│ ├── bldg_min_baths (float)
|
|
||||||
│ └── bldg_min_area (int)
|
|
||||||
|
|
||||||
└── Apartment Details (for property type: apartment):
|
|
||||||
├── apt_min_beds: int
|
|
||||||
├── apt_max_beds: int
|
|
||||||
├── apt_min_baths: float
|
|
||||||
├── apt_max_baths: float
|
|
||||||
├── apt_min_price: int
|
|
||||||
├── apt_max_price: int
|
|
||||||
├── apt_min_sqft: int
|
|
||||||
├── apt_max_sqft: int
|
|
||||||
```
|
```
|
||||||
## Supported Countries for Property Scraping
|
## Supported Countries for Property Scraping
|
||||||
|
|
||||||
@@ -144,7 +145,7 @@ The following exceptions may be raised when using HomeHarvest:
|
|||||||
- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com`
|
- `InvalidSite` - valid options: `zillow`, `redfin`, `realtor.com`
|
||||||
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
|
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
|
||||||
- `NoResultsFound` - no properties found from your input
|
- `NoResultsFound` - no properties found from your input
|
||||||
- `GeoCoordsNotFound` - if Zillow scraper is not able to create geo-coordinates from the location you input
|
- `GeoCoordsNotFound` - if Zillow scraper is not able to derive geo-coordinates from the location you input
|
||||||
|
|
||||||
## Frequently Asked Questions
|
## Frequently Asked Questions
|
||||||
|
|
||||||
|
|||||||
@@ -23,9 +23,7 @@ def _validate_input(site_name: str, listing_type: str) -> None:
|
|||||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||||
|
|
||||||
if listing_type.upper() not in ListingType.__members__:
|
if listing_type.upper() not in ListingType.__members__:
|
||||||
raise InvalidListingType(
|
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
|
||||||
f"Provided listing type, '{listing_type}', does not exist."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_ordered_properties(result: Property) -> list[str]:
|
def _get_ordered_properties(result: Property) -> list[str]:
|
||||||
@@ -35,34 +33,26 @@ def _get_ordered_properties(result: Property) -> list[str]:
|
|||||||
"listing_type",
|
"listing_type",
|
||||||
"property_type",
|
"property_type",
|
||||||
"status_text",
|
"status_text",
|
||||||
"currency",
|
"baths_min",
|
||||||
"price",
|
"baths_max",
|
||||||
"apt_min_price",
|
"beds_min",
|
||||||
"apt_max_price",
|
"beds_max",
|
||||||
"apt_min_sqft",
|
"sqft_min",
|
||||||
"apt_max_sqft",
|
"sqft_max",
|
||||||
"apt_min_beds",
|
"price_min",
|
||||||
"apt_max_beds",
|
"price_max",
|
||||||
"apt_min_baths",
|
"unit_count",
|
||||||
"apt_max_baths",
|
|
||||||
"tax_assessed_value",
|
"tax_assessed_value",
|
||||||
"square_feet",
|
|
||||||
"price_per_sqft",
|
"price_per_sqft",
|
||||||
"beds",
|
|
||||||
"baths",
|
|
||||||
"lot_area_value",
|
"lot_area_value",
|
||||||
"lot_area_unit",
|
"lot_area_unit",
|
||||||
"street_address",
|
"address_one",
|
||||||
"unit",
|
"address_two",
|
||||||
"city",
|
"city",
|
||||||
"state",
|
"state",
|
||||||
"zip_code",
|
"zip_code",
|
||||||
"country",
|
|
||||||
"posted_time",
|
"posted_time",
|
||||||
"bldg_min_beds",
|
"area_min",
|
||||||
"bldg_min_baths",
|
|
||||||
"bldg_min_area",
|
|
||||||
"bldg_unit_count",
|
|
||||||
"bldg_name",
|
"bldg_name",
|
||||||
"stories",
|
"stories",
|
||||||
"year_built",
|
"year_built",
|
||||||
@@ -86,12 +76,11 @@ def _process_result(result: Property) -> pd.DataFrame:
|
|||||||
prop_data["property_type"] = None
|
prop_data["property_type"] = None
|
||||||
if "address" in prop_data:
|
if "address" in prop_data:
|
||||||
address_data = prop_data["address"]
|
address_data = prop_data["address"]
|
||||||
prop_data["street_address"] = address_data.street_address
|
prop_data["address_one"] = address_data.address_one
|
||||||
prop_data["unit"] = address_data.unit
|
prop_data["address_two"] = address_data.address_two
|
||||||
prop_data["city"] = address_data.city
|
prop_data["city"] = address_data.city
|
||||||
prop_data["state"] = address_data.state
|
prop_data["state"] = address_data.state
|
||||||
prop_data["zip_code"] = address_data.zip_code
|
prop_data["zip_code"] = address_data.zip_code
|
||||||
prop_data["country"] = address_data.country
|
|
||||||
|
|
||||||
del prop_data["address"]
|
del prop_data["address"]
|
||||||
|
|
||||||
@@ -101,9 +90,7 @@ def _process_result(result: Property) -> pd.DataFrame:
|
|||||||
return properties_df
|
return properties_df
|
||||||
|
|
||||||
|
|
||||||
def _scrape_single_site(
|
def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
|
||||||
location: str, site_name: str, listing_type: str, proxy: str = None
|
|
||||||
) -> pd.DataFrame:
|
|
||||||
"""
|
"""
|
||||||
Helper function to scrape a single site.
|
Helper function to scrape a single site.
|
||||||
"""
|
"""
|
||||||
@@ -120,9 +107,7 @@ def _scrape_single_site(
|
|||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [_process_result(result) for result in results]
|
properties_dfs = [_process_result(result) for result in results]
|
||||||
properties_dfs = [
|
properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
|
||||||
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
|
|
||||||
]
|
|
||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
@@ -134,6 +119,7 @@ def scrape_property(
|
|||||||
site_name: Union[str, list[str]] = None,
|
site_name: Union[str, list[str]] = None,
|
||||||
listing_type: str = "for_sale",
|
listing_type: str = "for_sale",
|
||||||
proxy: str = None,
|
proxy: str = None,
|
||||||
|
keep_duplicates: bool = False
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Scrape property from various sites from a given location and listing type.
|
Scrape property from various sites from a given location and listing type.
|
||||||
@@ -158,9 +144,7 @@ def scrape_property(
|
|||||||
else:
|
else:
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
futures = {
|
futures = {
|
||||||
executor.submit(
|
executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name
|
||||||
_scrape_single_site, location, s_name, listing_type, proxy
|
|
||||||
): s_name
|
|
||||||
for s_name in site_name
|
for s_name in site_name
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,14 +159,13 @@ def scrape_property(
|
|||||||
|
|
||||||
final_df = pd.concat(results, ignore_index=True)
|
final_df = pd.concat(results, ignore_index=True)
|
||||||
|
|
||||||
columns_to_track = ["street_address", "city", "unit"]
|
columns_to_track = ["address_one", "address_two", "city"]
|
||||||
|
|
||||||
#: validate they exist, otherwise create them
|
#: validate they exist, otherwise create them
|
||||||
for col in columns_to_track:
|
for col in columns_to_track:
|
||||||
if col not in final_df.columns:
|
if col not in final_df.columns:
|
||||||
final_df[col] = None
|
final_df[col] = None
|
||||||
|
|
||||||
final_df = final_df.drop_duplicates(
|
if not keep_duplicates:
|
||||||
subset=["street_address", "city", "unit"], keep="first"
|
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
|
||||||
)
|
|
||||||
return final_df
|
return final_df
|
||||||
|
|||||||
@@ -5,9 +5,7 @@ from homeharvest import scrape_property
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
|
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
|
||||||
parser.add_argument(
|
parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")
|
||||||
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-s",
|
"-s",
|
||||||
@@ -45,14 +43,17 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
|
"-k",
|
||||||
|
"--keep_duplicates",
|
||||||
|
action="store_true",
|
||||||
|
help="Keep duplicate properties based on address"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
result = scrape_property(
|
result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
|
||||||
args.location, args.site_name, args.listing_type, proxy=args.proxy
|
|
||||||
)
|
|
||||||
|
|
||||||
if not args.filename:
|
if not args.filename:
|
||||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
|||||||
@@ -19,10 +19,7 @@ class Scraper:
|
|||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
if scraper_input.proxy:
|
if scraper_input.proxy:
|
||||||
proxy_url = scraper_input.proxy
|
proxy_url = scraper_input.proxy
|
||||||
proxies = {
|
proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
"http": proxy_url,
|
|
||||||
"https": proxy_url
|
|
||||||
}
|
|
||||||
self.session.proxies.update(proxies)
|
self.session.proxies.update(proxies)
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
|
||||||
class SiteName(Enum):
|
class SiteName(Enum):
|
||||||
@@ -56,12 +57,11 @@ class PropertyType(Enum):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Address:
|
class Address:
|
||||||
street_address: str
|
address_one: str | None = None
|
||||||
city: str
|
address_two: str | None = "#"
|
||||||
state: str
|
city: str | None = None
|
||||||
zip_code: str
|
state: str | None = None
|
||||||
unit: str | None = None
|
zip_code: str | None = None
|
||||||
country: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -73,12 +73,7 @@ class Property:
|
|||||||
property_type: PropertyType | None = None
|
property_type: PropertyType | None = None
|
||||||
|
|
||||||
# house for sale
|
# house for sale
|
||||||
price: int | None = None
|
|
||||||
tax_assessed_value: int | None = None
|
tax_assessed_value: int | None = None
|
||||||
currency: str | None = None
|
|
||||||
square_feet: int | None = None
|
|
||||||
beds: int | None = None
|
|
||||||
baths: float | None = None
|
|
||||||
lot_area_value: float | None = None
|
lot_area_value: float | None = None
|
||||||
lot_area_unit: str | None = None
|
lot_area_unit: str | None = None
|
||||||
stories: int | None = None
|
stories: int | None = None
|
||||||
@@ -90,23 +85,25 @@ class Property:
|
|||||||
img_src: str | None = None
|
img_src: str | None = None
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
status_text: str | None = None
|
status_text: str | None = None
|
||||||
latitude: float | None = None
|
|
||||||
longitude: float | None = None
|
|
||||||
posted_time: str | None = None
|
posted_time: str | None = None
|
||||||
|
|
||||||
# building for sale
|
# building for sale
|
||||||
bldg_name: str | None = None
|
bldg_name: str | None = None
|
||||||
bldg_unit_count: int | None = None
|
area_min: int | None = None
|
||||||
bldg_min_beds: int | None = None
|
|
||||||
bldg_min_baths: float | None = None
|
|
||||||
bldg_min_area: int | None = None
|
|
||||||
|
|
||||||
# apt
|
beds_min: int | None = None
|
||||||
apt_min_beds: int | None = None
|
beds_max: int | None = None
|
||||||
apt_max_beds: int | None = None
|
|
||||||
apt_min_baths: float | None = None
|
baths_min: float | None = None
|
||||||
apt_max_baths: float | None = None
|
baths_max: float | None = None
|
||||||
apt_min_price: int | None = None
|
|
||||||
apt_max_price: int | None = None
|
sqft_min: int | None = None
|
||||||
apt_min_sqft: int | None = None
|
sqft_max: int | None = None
|
||||||
apt_max_sqft: int | None = None
|
|
||||||
|
price_min: int | None = None
|
||||||
|
price_max: int | None = None
|
||||||
|
|
||||||
|
unit_count: int | None = None
|
||||||
|
|
||||||
|
latitude: float | None = None
|
||||||
|
longitude: float | None = None
|
||||||
|
|||||||
@@ -1,16 +1,23 @@
|
|||||||
import json
|
"""
|
||||||
|
homeharvest.realtor.__init__
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module implements the scraper for realtor.com
|
||||||
|
"""
|
||||||
from ..models import Property, Address
|
from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any, Generator
|
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_one, parse_address_two
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
class RealtorScraper(Scraper):
|
class RealtorScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
|
self.counter = 1
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
self.search_url = (
|
||||||
|
"https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
||||||
|
)
|
||||||
|
|
||||||
def handle_location(self):
|
def handle_location(self):
|
||||||
headers = {
|
headers = {
|
||||||
@@ -50,6 +57,9 @@ class RealtorScraper(Scraper):
|
|||||||
return result[0]
|
return result[0]
|
||||||
|
|
||||||
def handle_address(self, property_id: str) -> list[Property]:
|
def handle_address(self, property_id: str) -> list[Property]:
|
||||||
|
"""
|
||||||
|
Handles a specific address & returns one property
|
||||||
|
"""
|
||||||
query = """query Property($property_id: ID!) {
|
query = """query Property($property_id: ID!) {
|
||||||
property(id: $property_id) {
|
property(id: $property_id) {
|
||||||
property_id
|
property_id
|
||||||
@@ -108,43 +118,45 @@ class RealtorScraper(Scraper):
|
|||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
property_info = response_json["data"]["property"]
|
property_info = response_json["data"]["property"]
|
||||||
street_address, unit = parse_address_two(property_info["address"]["line"])
|
address_one, address_two = parse_address_one(property_info["address"]["line"])
|
||||||
|
|
||||||
return [
|
return [
|
||||||
Property(
|
Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=street_address,
|
address_one=address_one,
|
||||||
|
address_two=address_two,
|
||||||
city=property_info["address"]["city"],
|
city=property_info["address"]["city"],
|
||||||
state=property_info["address"]["state_code"],
|
state=property_info["address"]["state_code"],
|
||||||
zip_code=property_info["address"]["postal_code"],
|
zip_code=property_info["address"]["postal_code"],
|
||||||
unit=unit,
|
|
||||||
country="USA",
|
|
||||||
),
|
),
|
||||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
+ property_info["details"]["permalink"],
|
+ property_info["details"]["permalink"],
|
||||||
beds=property_info["basic"]["beds"],
|
|
||||||
baths=property_info["basic"]["baths"],
|
|
||||||
stories=property_info["details"]["stories"],
|
stories=property_info["details"]["stories"],
|
||||||
year_built=property_info["details"]["year_built"],
|
year_built=property_info["details"]["year_built"],
|
||||||
square_feet=property_info["basic"]["sqft"],
|
price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"]
|
||||||
price_per_sqft=property_info["basic"]["price"]
|
if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None
|
||||||
// property_info["basic"]["sqft"]
|
|
||||||
if property_info["basic"]["sqft"] is not None
|
|
||||||
and property_info["basic"]["price"] is not None
|
|
||||||
else None,
|
else None,
|
||||||
price=property_info["basic"]["price"],
|
|
||||||
mls_id=property_id,
|
mls_id=property_id,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_area_value=property_info["public_record"]["lot_size"]
|
lot_area_value=property_info["public_record"]["lot_size"]
|
||||||
if property_info["public_record"] is not None
|
if property_info["public_record"] is not None
|
||||||
else None,
|
else None,
|
||||||
|
beds_min=property_info["basic"]["beds"],
|
||||||
|
beds_max=property_info["basic"]["beds"],
|
||||||
|
baths_min=property_info["basic"]["baths"],
|
||||||
|
baths_max=property_info["basic"]["baths"],
|
||||||
|
sqft_min=property_info["basic"]["sqft"],
|
||||||
|
sqft_max=property_info["basic"]["sqft"],
|
||||||
|
price_min=property_info["basic"]["price"],
|
||||||
|
price_max=property_info["basic"]["price"],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
def handle_area(
|
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
|
||||||
self, variables: dict, return_total: bool = False
|
"""
|
||||||
) -> list[Property] | int:
|
Handles a location area & returns a list of properties
|
||||||
|
"""
|
||||||
query = (
|
query = (
|
||||||
"""query Home_search(
|
"""query Home_search(
|
||||||
$city: String,
|
$city: String,
|
||||||
@@ -237,17 +249,15 @@ class RealtorScraper(Scraper):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
for result in response_json["data"]["home_search"]["results"]:
|
for result in response_json["data"]["home_search"]["results"]:
|
||||||
street_address, unit = parse_address_two(
|
self.counter += 1
|
||||||
result["location"]["address"]["line"]
|
address_one, _ = parse_address_one(result["location"]["address"]["line"])
|
||||||
)
|
|
||||||
realty_property = Property(
|
realty_property = Property(
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=street_address,
|
address_one=address_one,
|
||||||
city=result["location"]["address"]["city"],
|
city=result["location"]["address"]["city"],
|
||||||
state=result["location"]["address"]["state_code"],
|
state=result["location"]["address"]["state_code"],
|
||||||
zip_code=result["location"]["address"]["postal_code"],
|
zip_code=result["location"]["address"]["postal_code"],
|
||||||
unit=parse_unit(result["location"]["address"]["unit"]),
|
address_two=parse_address_two(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
|
||||||
),
|
),
|
||||||
latitude=result["location"]["address"]["coordinate"]["lat"]
|
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||||
if result
|
if result
|
||||||
@@ -264,20 +274,22 @@ class RealtorScraper(Scraper):
|
|||||||
and "lon" in result["location"]["address"]["coordinate"]
|
and "lon" in result["location"]["address"]["coordinate"]
|
||||||
else None,
|
else None,
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"],
|
||||||
+ result["property_id"],
|
|
||||||
beds=result["description"]["beds"],
|
|
||||||
baths=result["description"]["baths"],
|
|
||||||
stories=result["description"]["stories"],
|
stories=result["description"]["stories"],
|
||||||
year_built=result["description"]["year_built"],
|
year_built=result["description"]["year_built"],
|
||||||
square_feet=result["description"]["sqft"],
|
|
||||||
price_per_sqft=result["price_per_sqft"],
|
price_per_sqft=result["price_per_sqft"],
|
||||||
price=result["list_price"],
|
|
||||||
mls_id=result["property_id"],
|
mls_id=result["property_id"],
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_area_value=result["description"]["lot_sqft"],
|
lot_area_value=result["description"]["lot_sqft"],
|
||||||
|
beds_min=result["description"]["beds"],
|
||||||
|
beds_max=result["description"]["beds"],
|
||||||
|
baths_min=result["description"]["baths"],
|
||||||
|
baths_max=result["description"]["baths"],
|
||||||
|
sqft_min=result["description"]["sqft"],
|
||||||
|
sqft_max=result["description"]["sqft"],
|
||||||
|
price_min=result["list_price"],
|
||||||
|
price_max=result["list_price"],
|
||||||
)
|
)
|
||||||
|
|
||||||
properties.append(realty_property)
|
properties.append(realty_property)
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
|||||||
@@ -1,7 +1,13 @@
|
|||||||
|
"""
|
||||||
|
homeharvest.redfin.__init__
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module implements the scraper for redfin.com
|
||||||
|
"""
|
||||||
import json
|
import json
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_two, parse_address_one
|
||||||
from ..models import Property, Address, PropertyType, ListingType, SiteName
|
from ..models import Property, Address, PropertyType, ListingType, SiteName
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound
|
||||||
|
|
||||||
@@ -12,9 +18,7 @@ class RedfinScraper(Scraper):
|
|||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
def _handle_location(self):
|
def _handle_location(self):
|
||||||
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
|
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(self.location)
|
||||||
self.location
|
|
||||||
)
|
|
||||||
|
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
@@ -28,9 +32,7 @@ class RedfinScraper(Scraper):
|
|||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
|
||||||
if "exactMatch" not in response_json["payload"]:
|
if "exactMatch" not in response_json["payload"]:
|
||||||
raise NoResultsFound(
|
raise NoResultsFound("No results found for location: {}".format(self.location))
|
||||||
"No results found for location: {}".format(self.location)
|
|
||||||
)
|
|
||||||
|
|
||||||
if response_json["payload"]["exactMatch"] is not None:
|
if response_json["payload"]["exactMatch"] is not None:
|
||||||
target = response_json["payload"]["exactMatch"]
|
target = response_json["payload"]["exactMatch"]
|
||||||
@@ -45,39 +47,30 @@ class RedfinScraper(Scraper):
|
|||||||
return home[key]["value"]
|
return home[key]["value"]
|
||||||
|
|
||||||
if not single_search:
|
if not single_search:
|
||||||
street_address, unit = parse_address_two(get_value("streetLine"))
|
|
||||||
unit = parse_unit(get_value("streetLine"))
|
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=street_address,
|
address_one=parse_address_one(get_value("streetLine"))[0],
|
||||||
city=home["city"],
|
address_two=parse_address_one(get_value("streetLine"))[1],
|
||||||
state=home["state"],
|
city=home.get("city"),
|
||||||
zip_code=home["zip"],
|
state=home.get("state"),
|
||||||
unit=unit,
|
zip_code=home.get("zip"),
|
||||||
country="USA",
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
address_info = home["streetAddress"]
|
address_info = home.get("streetAddress")
|
||||||
street_address, unit = parse_address_two(address_info["assembledAddress"])
|
address_one, address_two = parse_address_one(address_info.get("assembledAddress"))
|
||||||
|
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=street_address,
|
address_one=address_one,
|
||||||
city=home["city"],
|
address_two=address_two,
|
||||||
state=home["state"],
|
city=home.get("city"),
|
||||||
zip_code=home["zip"],
|
state=home.get("state"),
|
||||||
unit=unit,
|
zip_code=home.get("zip"),
|
||||||
country="USA",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
url = "https://www.redfin.com{}".format(home["url"])
|
url = "https://www.redfin.com{}".format(home["url"])
|
||||||
#: property_type = home["propertyType"] if "propertyType" in home else None
|
|
||||||
lot_size_data = home.get("lotSize")
|
lot_size_data = home.get("lotSize")
|
||||||
|
|
||||||
if not isinstance(lot_size_data, int):
|
if not isinstance(lot_size_data, int):
|
||||||
lot_size = (
|
lot_size = lot_size_data.get("value", None) if isinstance(lot_size_data, dict) else None
|
||||||
lot_size_data.get("value", None)
|
|
||||||
if isinstance(lot_size_data, dict)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
lot_size = lot_size_data
|
lot_size = lot_size_data
|
||||||
|
|
||||||
@@ -86,26 +79,24 @@ class RedfinScraper(Scraper):
|
|||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
address=address,
|
address=address,
|
||||||
property_url=url,
|
property_url=url,
|
||||||
beds=home["beds"] if "beds" in home else None,
|
beds_min=home["beds"] if "beds" in home else None,
|
||||||
baths=home["baths"] if "baths" in home else None,
|
beds_max=home["beds"] if "beds" in home else None,
|
||||||
|
baths_min=home["baths"] if "baths" in home else None,
|
||||||
|
baths_max=home["baths"] if "baths" in home else None,
|
||||||
|
price_min=get_value("price"),
|
||||||
|
price_max=get_value("price"),
|
||||||
|
sqft_min=get_value("sqFt"),
|
||||||
|
sqft_max=get_value("sqFt"),
|
||||||
stories=home["stories"] if "stories" in home else None,
|
stories=home["stories"] if "stories" in home else None,
|
||||||
agent_name=get_value("listingAgent"),
|
agent_name=get_value("listingAgent"),
|
||||||
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
||||||
year_built=get_value("yearBuilt")
|
year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
|
||||||
if not single_search
|
|
||||||
else home["yearBuilt"],
|
|
||||||
square_feet=get_value("sqFt"),
|
|
||||||
lot_area_value=lot_size,
|
lot_area_value=lot_size,
|
||||||
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
||||||
price_per_sqft=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
|
||||||
price=get_value("price"),
|
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"]
|
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
||||||
if "latLong" in home and "latitude" in home["latLong"]
|
longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
|
||||||
else None,
|
|
||||||
longitude=home["latLong"]["longitude"]
|
|
||||||
if "latLong" in home and "longitude" in home["latLong"]
|
|
||||||
else None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _handle_rentals(self, region_id, region_type):
|
def _handle_rentals(self, region_id, region_type):
|
||||||
@@ -125,12 +116,10 @@ class RedfinScraper(Scraper):
|
|||||||
address_info = home_data.get("addressInfo", {})
|
address_info = home_data.get("addressInfo", {})
|
||||||
centroid = address_info.get("centroid", {}).get("centroid", {})
|
centroid = address_info.get("centroid", {}).get("centroid", {})
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=address_info.get("formattedStreetLine", None),
|
address_one=parse_address_one(address_info.get("formattedStreetLine"))[0],
|
||||||
city=address_info.get("city", None),
|
city=address_info.get("city"),
|
||||||
state=address_info.get("state", None),
|
state=address_info.get("state"),
|
||||||
zip_code=address_info.get("zip", None),
|
zip_code=address_info.get("zip"),
|
||||||
unit=None,
|
|
||||||
country="US" if address_info.get("countryCode", None) == 1 else None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
|
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
|
||||||
@@ -143,20 +132,20 @@ class RedfinScraper(Scraper):
|
|||||||
site_name=SiteName.REDFIN,
|
site_name=SiteName.REDFIN,
|
||||||
listing_type=ListingType.FOR_RENT,
|
listing_type=ListingType.FOR_RENT,
|
||||||
address=address,
|
address=address,
|
||||||
apt_min_beds=bed_range.get("min", None),
|
description=rental_data.get("description"),
|
||||||
apt_min_baths=bath_range.get("min", None),
|
latitude=centroid.get("latitude"),
|
||||||
apt_max_beds=bed_range.get("max", None),
|
longitude=centroid.get("longitude"),
|
||||||
apt_max_baths=bath_range.get("max", None),
|
baths_min=bath_range.get("min"),
|
||||||
description=rental_data.get("description", None),
|
baths_max=bath_range.get("max"),
|
||||||
latitude=centroid.get("latitude", None),
|
beds_min=bed_range.get("min"),
|
||||||
longitude=centroid.get("longitude", None),
|
beds_max=bed_range.get("max"),
|
||||||
apt_min_price=price_range.get("min", None),
|
price_min=price_range.get("min"),
|
||||||
apt_max_price=price_range.get("max", None),
|
price_max=price_range.get("max"),
|
||||||
apt_min_sqft=sqft_range.get("min", None),
|
sqft_min=sqft_range.get("min"),
|
||||||
apt_max_sqft=sqft_range.get("max", None),
|
sqft_max=sqft_range.get("max"),
|
||||||
img_src=home_data.get("staticMapUrl", None),
|
img_src=home_data.get("staticMapUrl"),
|
||||||
posted_time=rental_data.get("lastUpdated", None),
|
posted_time=rental_data.get("lastUpdated"),
|
||||||
bldg_name=rental_data.get("propertyName", None),
|
bldg_name=rental_data.get("propertyName"),
|
||||||
)
|
)
|
||||||
|
|
||||||
properties_list.append(property_)
|
properties_list.append(property_)
|
||||||
@@ -175,16 +164,15 @@ class RedfinScraper(Scraper):
|
|||||||
building["address"]["streetType"],
|
building["address"]["streetType"],
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
street_address, unit = parse_address_two(street_address)
|
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_type=PropertyType("BUILDING"),
|
property_type=PropertyType("BUILDING"),
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=street_address,
|
address_one=parse_address_one(street_address)[0],
|
||||||
city=building["address"]["city"],
|
city=building["address"]["city"],
|
||||||
state=building["address"]["stateOrProvinceCode"],
|
state=building["address"]["stateOrProvinceCode"],
|
||||||
zip_code=building["address"]["postalCode"],
|
zip_code=building["address"]["postalCode"],
|
||||||
unit=parse_unit(
|
address_two=parse_address_two(
|
||||||
" ".join(
|
" ".join(
|
||||||
[
|
[
|
||||||
building["address"]["unitType"],
|
building["address"]["unitType"],
|
||||||
@@ -195,7 +183,7 @@ class RedfinScraper(Scraper):
|
|||||||
),
|
),
|
||||||
property_url="https://www.redfin.com{}".format(building["url"]),
|
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
bldg_unit_count=building["numUnitsForSale"],
|
unit_count=building.get("numUnitsForSale"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_address(self, home_id: str):
|
def handle_address(self, home_id: str):
|
||||||
@@ -206,7 +194,6 @@ class RedfinScraper(Scraper):
|
|||||||
https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
|
https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
|
||||||
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
|
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
|
||||||
"""
|
"""
|
||||||
|
|
||||||
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
|
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
|
||||||
home_id
|
home_id
|
||||||
)
|
)
|
||||||
@@ -214,9 +201,7 @@ class RedfinScraper(Scraper):
|
|||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
|
|
||||||
parsed_home = self._parse_home(
|
parsed_home = self._parse_home(response_json["payload"]["addressSectionInfo"], single_search=True)
|
||||||
response_json["payload"]["addressSectionInfo"], single_search=True
|
|
||||||
)
|
|
||||||
return [parsed_home]
|
return [parsed_home]
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
@@ -235,10 +220,14 @@ class RedfinScraper(Scraper):
|
|||||||
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
homes = [
|
|
||||||
self._parse_home(home) for home in response_json["payload"]["homes"]
|
if "payload" in response_json:
|
||||||
] + [
|
homes_list = response_json["payload"].get("homes", [])
|
||||||
self._parse_building(building)
|
buildings_list = response_json["payload"].get("buildings", {}).values()
|
||||||
for building in response_json["payload"]["buildings"].values()
|
|
||||||
|
homes = [self._parse_home(home) for home in homes_list] + [
|
||||||
|
self._parse_building(building) for building in buildings_list
|
||||||
]
|
]
|
||||||
return homes
|
return homes
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|||||||
@@ -1,7 +1,13 @@
|
|||||||
|
"""
|
||||||
|
homeharvest.zillow.__init__
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module implements the scraper for zillow.com
|
||||||
|
"""
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_one, parse_address_two
|
||||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType
|
from ..models import Property, Address, ListingType, PropertyType
|
||||||
|
|
||||||
@@ -9,16 +15,18 @@ from ..models import Property, Address, ListingType, PropertyType
|
|||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
|
self.cookies = None
|
||||||
|
|
||||||
if not self.is_plausible_location(self.location):
|
if not self.is_plausible_location(self.location):
|
||||||
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||||
|
|
||||||
if self.listing_type == ListingType.FOR_SALE:
|
listing_type_to_url_path = {
|
||||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
ListingType.FOR_SALE: "for_sale",
|
||||||
elif self.listing_type == ListingType.FOR_RENT:
|
ListingType.FOR_RENT: "for_rent",
|
||||||
self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
|
ListingType.SOLD: "recently_sold",
|
||||||
else:
|
}
|
||||||
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
|
||||||
|
self.url = f"https://www.zillow.com/homes/{listing_type_to_url_path[self.listing_type]}/{self.location}_rb/"
|
||||||
|
|
||||||
def is_plausible_location(self, location: str) -> bool:
|
def is_plausible_location(self, location: str) -> bool:
|
||||||
url = (
|
url = (
|
||||||
@@ -31,9 +39,7 @@ class ZillowScraper(Scraper):
|
|||||||
return response.json()["results"] != []
|
return response.json()["results"] != []
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(
|
resp = self.session.get(self.url, headers=self._get_headers())
|
||||||
self.url, headers=self._get_headers()
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
content = resp.text
|
content = resp.text
|
||||||
|
|
||||||
@@ -43,9 +49,7 @@ class ZillowScraper(Scraper):
|
|||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
if not match:
|
if not match:
|
||||||
raise NoResultsFound(
|
raise NoResultsFound("No results were found for Zillow with the given Location.")
|
||||||
"No results were found for Zillow with the given Location."
|
|
||||||
)
|
|
||||||
|
|
||||||
json_str = match.group(1)
|
json_str = match.group(1)
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
@@ -130,10 +134,9 @@ class ZillowScraper(Scraper):
|
|||||||
"wants": {"cat1": ["mapResults"]},
|
"wants": {"cat1": ["mapResults"]},
|
||||||
"isDebugRequest": False,
|
"isDebugRequest": False,
|
||||||
}
|
}
|
||||||
resp = self.session.put(
|
resp = self.session.put(url, headers=self._get_headers(), json=payload)
|
||||||
url, headers=self._get_headers(), json=payload
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
self.cookies = resp.cookies
|
||||||
a = resp.json()
|
a = resp.json()
|
||||||
return self._parse_properties(resp.json())
|
return self._parse_properties(resp.json())
|
||||||
|
|
||||||
@@ -146,87 +149,70 @@ class ZillowScraper(Scraper):
|
|||||||
if "hdpData" in result:
|
if "hdpData" in result:
|
||||||
home_info = result["hdpData"]["homeInfo"]
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
address_data = {
|
address_data = {
|
||||||
"street_address": parse_address_two(home_info["streetAddress"])[0],
|
"address_one": parse_address_one(home_info.get("streetAddress"))[0],
|
||||||
"unit": parse_unit(home_info["unit"])
|
"address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
|
||||||
if "unit" in home_info
|
"city": home_info.get("city"),
|
||||||
else None,
|
"state": home_info.get("state"),
|
||||||
"city": home_info["city"],
|
"zip_code": home_info.get("zipcode"),
|
||||||
"state": home_info["state"],
|
|
||||||
"zip_code": home_info["zipcode"],
|
|
||||||
"country": home_info["country"],
|
|
||||||
}
|
}
|
||||||
property_data = {
|
property_obj = Property(
|
||||||
"site_name": self.site_name,
|
site_name=self.site_name,
|
||||||
"address": Address(**address_data),
|
address=Address(**address_data),
|
||||||
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
property_url=f"https://www.zillow.com{result['detailUrl']}",
|
||||||
"beds": int(home_info["bedrooms"])
|
tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None,
|
||||||
if "bedrooms" in home_info
|
property_type=PropertyType(home_info.get("homeType")),
|
||||||
else None,
|
listing_type=ListingType(
|
||||||
"baths": home_info.get("bathrooms"),
|
home_info["statusType"] if "statusType" in home_info else self.listing_type
|
||||||
"square_feet": int(home_info["livingArea"])
|
|
||||||
if "livingArea" in home_info
|
|
||||||
else None,
|
|
||||||
"currency": home_info["currency"],
|
|
||||||
"price": home_info.get("price"),
|
|
||||||
"tax_assessed_value": int(home_info["taxAssessedValue"])
|
|
||||||
if "taxAssessedValue" in home_info
|
|
||||||
else None,
|
|
||||||
"property_type": PropertyType(home_info["homeType"]),
|
|
||||||
"listing_type": ListingType(
|
|
||||||
home_info["statusType"]
|
|
||||||
if "statusType" in home_info
|
|
||||||
else self.listing_type
|
|
||||||
),
|
),
|
||||||
"lot_area_value": round(home_info["lotAreaValue"], 2)
|
status_text=result.get("statusText"),
|
||||||
if "lotAreaValue" in home_info
|
posted_time=result["variableData"]["text"]
|
||||||
else None,
|
|
||||||
"lot_area_unit": home_info.get("lotAreaUnit"),
|
|
||||||
"latitude": result["latLong"]["latitude"],
|
|
||||||
"longitude": result["latLong"]["longitude"],
|
|
||||||
"status_text": result.get("statusText"),
|
|
||||||
"posted_time": result["variableData"]["text"]
|
|
||||||
if "variableData" in result
|
if "variableData" in result
|
||||||
and "text" in result["variableData"]
|
and "text" in result["variableData"]
|
||||||
and result["variableData"]["type"] == "TIME_ON_INFO"
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
else None,
|
else None,
|
||||||
"img_src": result.get("imgSrc"),
|
price_min=home_info.get("price"),
|
||||||
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
|
price_max=home_info.get("price"),
|
||||||
if "livingArea" in home_info
|
beds_min=int(home_info["bedrooms"]) if "bedrooms" in home_info else None,
|
||||||
and home_info["livingArea"] != 0
|
beds_max=int(home_info["bedrooms"]) if "bedrooms" in home_info else None,
|
||||||
and "price" in home_info
|
baths_min=home_info.get("bathrooms"),
|
||||||
|
baths_max=home_info.get("bathrooms"),
|
||||||
|
sqft_min=int(home_info["livingArea"]) if "livingArea" in home_info else None,
|
||||||
|
sqft_max=int(home_info["livingArea"]) if "livingArea" in home_info else None,
|
||||||
|
price_per_sqft=int(home_info["price"] // home_info["livingArea"])
|
||||||
|
if "livingArea" in home_info and home_info["livingArea"] != 0 and "price" in home_info
|
||||||
else None,
|
else None,
|
||||||
}
|
latitude=result["latLong"]["latitude"],
|
||||||
property_obj = Property(**property_data)
|
longitude=result["latLong"]["longitude"],
|
||||||
|
lot_area_value=round(home_info["lotAreaValue"], 2) if "lotAreaValue" in home_info else None,
|
||||||
|
lot_area_unit=home_info.get("lotAreaUnit"),
|
||||||
|
img_src=result.get("imgSrc"),
|
||||||
|
)
|
||||||
|
|
||||||
properties_list.append(property_obj)
|
properties_list.append(property_obj)
|
||||||
|
|
||||||
elif "isBuilding" in result:
|
elif "isBuilding" in result:
|
||||||
price = result["price"]
|
price_string = result["price"].replace("$", "").replace(",", "").replace("+/mo", "")
|
||||||
building_data = {
|
|
||||||
"property_url": f"https://www.zillow.com{result['detailUrl']}",
|
match = re.search(r"(\d+)", price_string)
|
||||||
"site_name": self.site_name,
|
price_value = int(match.group(1)) if match else None
|
||||||
"property_type": PropertyType("BUILDING"),
|
building_obj = Property(
|
||||||
"listing_type": ListingType(result["statusType"]),
|
property_url=f"https://www.zillow.com{result['detailUrl']}",
|
||||||
"img_src": result["imgSrc"],
|
site_name=self.site_name,
|
||||||
"price": int(price.replace("From $", "").replace(",", ""))
|
property_type=PropertyType("BUILDING"),
|
||||||
if "From $" in price
|
listing_type=ListingType(result["statusType"]),
|
||||||
else None,
|
img_src=result.get("imgSrc"),
|
||||||
"apt_min_price": int(
|
address=self._extract_address(result["address"]),
|
||||||
price.replace("$", "").replace(",", "").replace("+/mo", "")
|
baths_min=result.get("minBaths"),
|
||||||
|
area_min=result.get("minArea"),
|
||||||
|
bldg_name=result.get("communityName"),
|
||||||
|
status_text=result.get("statusText"),
|
||||||
|
price_min=price_value if "+/mo" in result.get("price") else None,
|
||||||
|
price_max=price_value if "+/mo" in result.get("price") else None,
|
||||||
|
latitude=result.get("latLong", {}).get("latitude"),
|
||||||
|
longitude=result.get("latLong", {}).get("longitude"),
|
||||||
|
unit_count=result.get("unitCount"),
|
||||||
)
|
)
|
||||||
if "+/mo" in price
|
|
||||||
else None,
|
|
||||||
"address": self._extract_address(result["address"]),
|
|
||||||
"bldg_min_beds": result["minBeds"],
|
|
||||||
"currency": "USD",
|
|
||||||
"bldg_min_baths": result["minBaths"],
|
|
||||||
"bldg_min_area": result.get("minArea"),
|
|
||||||
"bldg_unit_count": result["unitCount"],
|
|
||||||
"bldg_name": result.get("communityName"),
|
|
||||||
"status_text": result["statusText"],
|
|
||||||
"latitude": result["latLong"]["latitude"],
|
|
||||||
"longitude": result["latLong"]["longitude"],
|
|
||||||
}
|
|
||||||
building_obj = Property(**building_data)
|
|
||||||
properties_list.append(building_obj)
|
properties_list.append(building_obj)
|
||||||
|
|
||||||
return properties_list
|
return properties_list
|
||||||
@@ -241,43 +227,41 @@ class ZillowScraper(Scraper):
|
|||||||
else property_data["hdpUrl"]
|
else property_data["hdpUrl"]
|
||||||
)
|
)
|
||||||
address_data = property_data["address"]
|
address_data = property_data["address"]
|
||||||
street_address, unit = parse_address_two(address_data["streetAddress"])
|
address_one, address_two = parse_address_one(address_data["streetAddress"])
|
||||||
address = Address(
|
address = Address(
|
||||||
street_address=street_address,
|
address_one=address_one,
|
||||||
unit=unit,
|
address_two=address_two if address_two else "#",
|
||||||
city=address_data["city"],
|
city=address_data["city"],
|
||||||
state=address_data["state"],
|
state=address_data["state"],
|
||||||
zip_code=address_data["zipcode"],
|
zip_code=address_data["zipcode"],
|
||||||
country=property_data.get("country"),
|
|
||||||
)
|
)
|
||||||
property_type = property_data.get("homeType", None)
|
property_type = property_data.get("homeType", None)
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=address,
|
|
||||||
property_url=url,
|
property_url=url,
|
||||||
beds=property_data.get("bedrooms", None),
|
property_type=PropertyType(property_type),
|
||||||
baths=property_data.get("bathrooms", None),
|
listing_type=self.listing_type,
|
||||||
year_built=property_data.get("yearBuilt", None),
|
address=address,
|
||||||
price=property_data.get("price", None),
|
year_built=property_data.get("yearBuilt"),
|
||||||
tax_assessed_value=property_data.get("taxAssessedValue", None),
|
tax_assessed_value=property_data.get("taxAssessedValue"),
|
||||||
|
lot_area_value=property_data.get("lotAreaValue"),
|
||||||
|
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
|
||||||
|
agent_name=property_data.get("attributionInfo", {}).get("agentName"),
|
||||||
|
stories=property_data.get("resoFacts", {}).get("stories"),
|
||||||
|
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
|
||||||
|
beds_min=property_data.get("bedrooms"),
|
||||||
|
beds_max=property_data.get("bedrooms"),
|
||||||
|
baths_min=property_data.get("bathrooms"),
|
||||||
|
baths_max=property_data.get("bathrooms"),
|
||||||
|
price_min=property_data.get("price"),
|
||||||
|
price_max=property_data.get("price"),
|
||||||
|
sqft_min=property_data.get("livingArea"),
|
||||||
|
sqft_max=property_data.get("livingArea"),
|
||||||
|
price_per_sqft=property_data.get("resoFacts", {}).get("pricePerSquareFoot"),
|
||||||
latitude=property_data.get("latitude"),
|
latitude=property_data.get("latitude"),
|
||||||
longitude=property_data.get("longitude"),
|
longitude=property_data.get("longitude"),
|
||||||
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
|
img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
|
||||||
currency=property_data.get("currency", None),
|
description=property_data.get("description"),
|
||||||
lot_area_value=property_data.get("lotAreaValue"),
|
|
||||||
lot_area_unit=property_data["lotAreaUnits"].lower()
|
|
||||||
if "lotAreaUnits" in property_data
|
|
||||||
else None,
|
|
||||||
agent_name=property_data.get("attributionInfo", {}).get("agentName", None),
|
|
||||||
stories=property_data.get("resoFacts", {}).get("stories", None),
|
|
||||||
description=property_data.get("description", None),
|
|
||||||
mls_id=property_data.get("attributionInfo", {}).get("mlsId", None),
|
|
||||||
price_per_sqft=property_data.get("resoFacts", {}).get(
|
|
||||||
"pricePerSquareFoot", None
|
|
||||||
),
|
|
||||||
square_feet=property_data.get("livingArea", None),
|
|
||||||
property_type=PropertyType(property_type),
|
|
||||||
listing_type=self.listing_type,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_address(self, address_str):
|
def _extract_address(self, address_str):
|
||||||
@@ -290,7 +274,7 @@ class ZillowScraper(Scraper):
|
|||||||
if len(parts) != 3:
|
if len(parts) != 3:
|
||||||
raise ValueError(f"Unexpected address format: {address_str}")
|
raise ValueError(f"Unexpected address format: {address_str}")
|
||||||
|
|
||||||
street_address = parts[0].strip()
|
address_one = parts[0].strip()
|
||||||
city = parts[1].strip()
|
city = parts[1].strip()
|
||||||
state_zip = parts[2].split(" ")
|
state_zip = parts[2].split(" ")
|
||||||
|
|
||||||
@@ -303,24 +287,21 @@ class ZillowScraper(Scraper):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
raise ValueError(f"Unexpected state/zip format in address: {address_str}")
|
||||||
|
|
||||||
street_address, unit = parse_address_two(street_address)
|
address_one, address_two = parse_address_one(address_one)
|
||||||
return Address(
|
return Address(
|
||||||
street_address=street_address,
|
address_one=address_one,
|
||||||
|
address_two=address_two if address_two else "#",
|
||||||
city=city,
|
city=city,
|
||||||
unit=unit,
|
|
||||||
state=state,
|
state=state,
|
||||||
zip_code=zip_code,
|
zip_code=zip_code,
|
||||||
country="USA",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
def _get_headers(self):
|
||||||
def _get_headers():
|
headers = {
|
||||||
return {
|
|
||||||
"authority": "www.zillow.com",
|
"authority": "www.zillow.com",
|
||||||
"accept": "*/*",
|
"accept": "*/*",
|
||||||
"accept-language": "en-US,en;q=0.9",
|
"accept-language": "en-US,en;q=0.9",
|
||||||
"content-type": "application/json",
|
"content-type": "application/json",
|
||||||
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
|
|
||||||
"origin": "https://www.zillow.com",
|
"origin": "https://www.zillow.com",
|
||||||
"referer": "https://www.zillow.com",
|
"referer": "https://www.zillow.com",
|
||||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
||||||
@@ -331,3 +312,6 @@ class ZillowScraper(Scraper):
|
|||||||
"sec-fetch-site": "same-origin",
|
"sec-fetch-site": "same-origin",
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
|
if self.cookies:
|
||||||
|
headers['Cookie'] = self.cookies
|
||||||
|
return headers
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def parse_address_two(street_address: str) -> tuple:
|
def parse_address_one(street_address: str) -> tuple:
|
||||||
if not street_address:
|
if not street_address:
|
||||||
return street_address, None
|
return street_address, "#"
|
||||||
|
|
||||||
apt_match = re.search(
|
apt_match = re.search(
|
||||||
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
|
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
|
||||||
@@ -13,36 +13,26 @@ def parse_address_two(street_address: str) -> tuple:
|
|||||||
|
|
||||||
if apt_match:
|
if apt_match:
|
||||||
apt_str = apt_match.group().strip()
|
apt_str = apt_match.group().strip()
|
||||||
cleaned_apt_str = re.sub(
|
cleaned_apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I)
|
||||||
r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I
|
|
||||||
)
|
|
||||||
|
|
||||||
main_address = street_address.replace(apt_str, "").strip()
|
main_address = street_address.replace(apt_str, "").strip()
|
||||||
return main_address, cleaned_apt_str
|
return main_address, cleaned_apt_str
|
||||||
else:
|
else:
|
||||||
return street_address, None
|
return street_address, "#"
|
||||||
|
|
||||||
|
|
||||||
def parse_unit(street_address: str):
|
def parse_address_two(street_address: str):
|
||||||
if not street_address:
|
if not street_address:
|
||||||
return None
|
return "#"
|
||||||
apt_match = re.search(
|
apt_match = re.search(
|
||||||
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+)$",
|
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
|
||||||
street_address,
|
street_address,
|
||||||
re.I,
|
re.I,
|
||||||
)
|
)
|
||||||
|
|
||||||
if apt_match:
|
if apt_match:
|
||||||
apt_str = apt_match.group().strip()
|
apt_str = apt_match.group().strip()
|
||||||
apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*)", "#", apt_str, flags=re.I)
|
apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I)
|
||||||
return apt_str
|
return apt_str
|
||||||
else:
|
else:
|
||||||
return None
|
return "#"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
print(parse_address_two("4303 E Cactus Rd Apt 126"))
|
|
||||||
print(parse_address_two("1234 Elm Street apt 2B"))
|
|
||||||
print(parse_address_two("1234 Elm Street UNIT 3A"))
|
|
||||||
print(parse_address_two("1234 Elm Street unit 3A"))
|
|
||||||
print(parse_address_two("1234 Elm Street SuIte 3A"))
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.2.5"
|
version = "0.2.14"
|
||||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
|
|||||||
@@ -9,15 +9,9 @@ from homeharvest.exceptions import (
|
|||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(
|
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
|
||||||
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
|
||||||
),
|
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
|
||||||
scrape_property(
|
|
||||||
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
|
|
||||||
),
|
|
||||||
scrape_property(
|
|
||||||
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
|
||||||
),
|
|
||||||
scrape_property(location="85281", site_name="redfin"),
|
scrape_property(location="85281", site_name="redfin"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
24
tests/test_utils.py
Normal file
24
tests/test_utils.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from homeharvest.utils import parse_address_one, parse_address_two
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_address_one():
|
||||||
|
test_data = [
|
||||||
|
("4303 E Cactus Rd Apt 126", ("4303 E Cactus Rd", "#126")),
|
||||||
|
("1234 Elm Street apt 2B", ("1234 Elm Street", "#2B")),
|
||||||
|
("1234 Elm Street UNIT 3A", ("1234 Elm Street", "#3A")),
|
||||||
|
("1234 Elm Street unit 3A", ("1234 Elm Street", "#3A")),
|
||||||
|
("1234 Elm Street SuIte 3A", ("1234 Elm Street", "#3A")),
|
||||||
|
]
|
||||||
|
|
||||||
|
for input_data, (exp_addr_one, exp_addr_two) in test_data:
|
||||||
|
address_one, address_two = parse_address_one(input_data)
|
||||||
|
assert address_one == exp_addr_one
|
||||||
|
assert address_two == exp_addr_two
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_address_two():
|
||||||
|
test_data = [("Apt 126", "#126"), ("apt 2B", "#2B"), ("UNIT 3A", "#3A"), ("unit 3A", "#3A"), ("SuIte 3A", "#3A")]
|
||||||
|
|
||||||
|
for input_data, expected in test_data:
|
||||||
|
output = parse_address_two(input_data)
|
||||||
|
assert output == expected
|
||||||
@@ -9,15 +9,9 @@ from homeharvest.exceptions import (
|
|||||||
|
|
||||||
def test_zillow():
|
def test_zillow():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(
|
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
|
||||||
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
|
||||||
),
|
scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
|
||||||
scrape_property(
|
|
||||||
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
|
|
||||||
),
|
|
||||||
scrape_property(
|
|
||||||
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
|
||||||
),
|
|
||||||
scrape_property(location="85281", site_name="zillow"),
|
scrape_property(location="85281", site_name="zillow"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user