mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 20:14:30 -08:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a433e46258 | ||
|
|
df3519ae18 | ||
|
|
2f5ea1ca88 | ||
|
|
2f3b012747 | ||
|
|
5ea0fa0bdb | ||
|
|
2d6e746ae9 | ||
|
|
a772fe45aa | ||
|
|
4764b6bd37 | ||
|
|
0946abd35a | ||
|
|
0a2fb4cb31 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,2 +1,5 @@
|
|||||||
/.idea
|
/.idea
|
||||||
dist
|
**/dist/
|
||||||
|
**/__pycache__/
|
||||||
|
**/.pytest_cache/
|
||||||
|
*.pyc
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 Zachary Hampton
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
36
README.md
36
README.md
@@ -1 +1,35 @@
|
|||||||
# HomeHarvest
|
# HomeHarvest
|
||||||
|
|
||||||
|
**HomeHarvest** aims to be the top Python real estate scraping library.
|
||||||
|
|
||||||
|
## RoadMap
|
||||||
|
|
||||||
|
- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`.
|
||||||
|
- **Output**: Provides the option to return the scraped data as a Pandas dataframe.
|
||||||
|
- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience.
|
||||||
|
|
||||||
|
## Site Name Options
|
||||||
|
|
||||||
|
- `zillow`
|
||||||
|
- `redfin`
|
||||||
|
|
||||||
|
## Listing Types
|
||||||
|
|
||||||
|
- `for_rent`
|
||||||
|
- `for_sale`
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install --upgrade homeharvest
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Usage
|
||||||
|
```
|
||||||
|
from homeharvest import scrape_property
|
||||||
|
|
||||||
|
properties = scrape_property(
|
||||||
|
location="85281", site_name="zillow", listing_type="for_rent"
|
||||||
|
)
|
||||||
|
print(properties)
|
||||||
|
```
|
||||||
|
|||||||
@@ -1,24 +1,31 @@
|
|||||||
from .core.scrapers.redfin import RedfinScraper
|
from .core.scrapers.redfin import RedfinScraper
|
||||||
from .core.scrapers.types import ListingType, Home
|
from .core.scrapers.realtor import RealtorScraper
|
||||||
|
from .core.scrapers.zillow import ZillowScraper
|
||||||
|
from .core.scrapers.models import ListingType, Property, Building
|
||||||
from .core.scrapers import ScraperInput
|
from .core.scrapers import ScraperInput
|
||||||
from .exceptions import InvalidSite, InvalidListingType
|
from .exceptions import InvalidSite, InvalidListingType
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
_scrapers = {
|
_scrapers = {
|
||||||
"redfin": RedfinScraper,
|
"redfin": RedfinScraper,
|
||||||
|
"realtor.com": RealtorScraper,
|
||||||
|
"zillow": ZillowScraper,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def scrape_property(
|
def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
site_name: str,
|
||||||
site_name: str = "redfin",
|
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||||
) -> list[Home]: #: eventually, return pandas dataframe
|
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
|
||||||
if site_name.lower() not in _scrapers:
|
if site_name.lower() not in _scrapers:
|
||||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||||
|
|
||||||
if listing_type.upper() not in ListingType.__members__:
|
if listing_type.upper() not in ListingType.__members__:
|
||||||
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
|
raise InvalidListingType(
|
||||||
|
f"Provided listing type, '{listing_type}', does not exist."
|
||||||
|
)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
location=location,
|
location=location,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
from .types import Home, ListingType
|
from .models import Property, ListingType
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -11,9 +11,12 @@ class ScraperInput:
|
|||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
|
listing_type = ListingType.FOR_SALE
|
||||||
|
|
||||||
def __init__(self, scraper_input: ScraperInput):
|
def __init__(self, scraper_input: ScraperInput):
|
||||||
self.location = scraper_input.location
|
self.location = scraper_input.location
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
|
Scraper.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
if scraper_input.proxy_url:
|
if scraper_input.proxy_url:
|
||||||
self.session.proxies = {
|
self.session.proxies = {
|
||||||
@@ -21,7 +24,12 @@ class Scraper:
|
|||||||
"https": scraper_input.proxy_url,
|
"https": scraper_input.proxy_url,
|
||||||
}
|
}
|
||||||
|
|
||||||
def search(self) -> list[Home]: ...
|
def search(self) -> list[Property]:
|
||||||
|
...
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_home(home) -> Home: ...
|
def _parse_home(home) -> Property:
|
||||||
|
...
|
||||||
|
|
||||||
|
def handle_location(self):
|
||||||
|
...
|
||||||
|
|||||||
@@ -19,17 +19,34 @@ class Address:
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Home:
|
class Property:
|
||||||
address: Address
|
address: Address
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
beds: int | None = None
|
beds: int | None = None
|
||||||
baths: int | None = None
|
baths: float | None = None
|
||||||
stories: int | None = None
|
stories: int | None = None
|
||||||
agent_name: str | None = None
|
agent_name: str | None = None
|
||||||
description: str | None = None
|
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
square_feet: int | None = None
|
square_feet: int | None = None
|
||||||
price_per_square_foot: int | None = None
|
price_per_square_foot: int | None = None
|
||||||
|
year_built: int | None = None
|
||||||
price: int | None = None
|
price: int | None = None
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
|
listing_type: ListingType | None = None
|
||||||
|
lot_size: int | None = None
|
||||||
|
description: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Building:
|
||||||
|
address: Address
|
||||||
|
url: str
|
||||||
|
|
||||||
|
num_units: int | None = None
|
||||||
|
min_unit_price: int | None = None
|
||||||
|
max_unit_price: int | None = None
|
||||||
|
avg_unit_price: int | None = None
|
||||||
|
|
||||||
|
listing_type: str | None = None
|
||||||
51
homeharvest/core/scrapers/realtor/__init__.py
Normal file
51
homeharvest/core/scrapers/realtor/__init__.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import json
|
||||||
|
from ..models import Property, Address
|
||||||
|
from .. import Scraper
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class RealtorScraper(Scraper):
    """Scraper for realtor.com (work in progress).

    Only location resolution is implemented so far; ``search`` resolves the
    location but does not yet fetch or return any listings.
    """

    def __init__(self, scraper_input):
        super().__init__(scraper_input)

    def handle_location(self):
        """Resolve ``self.location`` through the realtor.com suggest endpoint.

        Returns:
            The first entry of the ``"autocomplete"`` list in the JSON
            response.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
            IndexError: if the endpoint returns no suggestions.
        """
        headers = {
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.realtor.com",
            "referer": "https://www.realtor.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }

        params = {
            "input": self.location,
            "client_id": "for-sale",
            "limit": "1",
            "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
        }

        response = self.session.get(
            "https://parser-external.geo.moveaws.com/suggest",
            params=params,
            headers=headers,
        )
        # Fail on HTTP errors here instead of on a confusing JSON/Key error
        # further down when the body is an error page.
        response.raise_for_status()
        response_json = response.json()

        # "limit" is "1", so at most one suggestion is expected.
        return response_json["autocomplete"][0]

    def search(self):
        """Resolve the location; listing retrieval is not implemented yet.

        TODO: fetch listings for the resolved location and return them as a
        list of ``Property`` objects. Until then this returns ``None``.
        """
        location_info = self.handle_location()
        location_type = location_info["area_type"]  # noqa: F841 - reserved for the pending implementation

        # Property types mentioned for the pending implementation:
        #   apartment + building + commercial + condo_townhome +
        #   condo_townhome_rowhome_coop + condos + coop + duplex_triplex +
        #   farm + investment + land + mobile + multi_family + rental +
        #   single_family + townhomes
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
from ..types import Home, Address
|
from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -8,56 +8,107 @@ class RedfinScraper(Scraper):
|
|||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
|
|
||||||
def handle_location(self):
|
def _handle_location(self):
|
||||||
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location)
|
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
|
||||||
|
self.location
|
||||||
|
)
|
||||||
|
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace('{}&&', ''))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
|
|
||||||
if response_json['payload']['exactMatch'] is not None:
|
def get_region_type(match_type: str):
|
||||||
return response_json['payload']['exactMatch']['id'].split('_')[1]
|
if match_type == "4":
|
||||||
|
return "2" #: zip
|
||||||
|
elif match_type == "2":
|
||||||
|
return "6" #: city
|
||||||
|
elif match_type == "1":
|
||||||
|
return "address" #: address, needs to be handled differently
|
||||||
|
|
||||||
|
if response_json["payload"]["exactMatch"] is not None:
|
||||||
|
target = response_json["payload"]["exactMatch"]
|
||||||
else:
|
else:
|
||||||
return response_json['payload']['sections'][0]['rows'][0].split('_')[1]
|
target = response_json["payload"]["sections"][0]["rows"][0]
|
||||||
|
|
||||||
|
return target["id"].split("_")[1], get_region_type(target["type"])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_home(home: dict) -> Home:
|
def _parse_home(home: dict, single_search: bool = False) -> Property:
|
||||||
address = Address(
|
|
||||||
address_one=home['streetLine']['value'],
|
|
||||||
city=home['city'],
|
|
||||||
state=home['state'],
|
|
||||||
zip_code=home['zip']
|
|
||||||
)
|
|
||||||
|
|
||||||
url = 'https://www.redfin.com{}'.format(home['url'])
|
|
||||||
|
|
||||||
def get_value(key: str) -> Any | None:
|
def get_value(key: str) -> Any | None:
|
||||||
if key in home and 'value' in home[key]:
|
if key in home and "value" in home[key]:
|
||||||
return home[key]['value']
|
return home[key]["value"]
|
||||||
|
|
||||||
return Home(
|
if not single_search:
|
||||||
|
address = Address(
|
||||||
|
address_one=get_value("streetLine"),
|
||||||
|
city=home["city"],
|
||||||
|
state=home["state"],
|
||||||
|
zip_code=home["zip"],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
address_info = home["streetAddress"]
|
||||||
|
|
||||||
|
address = Address(
|
||||||
|
address_one=address_info["assembledAddress"],
|
||||||
|
city=home["city"],
|
||||||
|
state=home["state"],
|
||||||
|
zip_code=home["zip"],
|
||||||
|
)
|
||||||
|
|
||||||
|
url = "https://www.redfin.com{}".format(home["url"])
|
||||||
|
|
||||||
|
return Property(
|
||||||
address=address,
|
address=address,
|
||||||
url=url,
|
url=url,
|
||||||
beds=home['beds'] if 'beds' in home else None,
|
beds=home["beds"] if "beds" in home else None,
|
||||||
baths=home['baths'] if 'baths' in home else None,
|
baths=home["baths"] if "baths" in home else None,
|
||||||
stories=home['stories'] if 'stories' in home else None,
|
stories=home["stories"] if "stories" in home else None,
|
||||||
agent_name=get_value('listingAgent'),
|
agent_name=get_value("listingAgent"),
|
||||||
description=home['listingRemarks'] if 'listingRemarks' in home else None,
|
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
||||||
year_built=get_value('yearBuilt'),
|
year_built=get_value("yearBuilt")
|
||||||
square_feet=get_value('sqFt'),
|
if not single_search
|
||||||
price_per_square_foot=get_value('pricePerSqFt'),
|
else home["yearBuilt"],
|
||||||
price=get_value('price'),
|
square_feet=get_value("sqFt"),
|
||||||
mls_id=get_value('mlsId')
|
price_per_square_foot=get_value("pricePerSqFt"),
|
||||||
|
price=get_value("price"),
|
||||||
|
mls_id=get_value("mlsId"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def search(self):
|
def handle_address(self, home_id: str):
|
||||||
region_id = self.handle_location()
|
"""
|
||||||
|
EPs:
|
||||||
|
https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
|
||||||
|
https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
|
||||||
|
https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
|
||||||
|
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
|
||||||
|
"""
|
||||||
|
|
||||||
url = 'https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type=2'.format(region_id)
|
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
|
||||||
|
home_id
|
||||||
|
)
|
||||||
|
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace('{}&&', ''))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
|
|
||||||
homes = [self.parse_home(home) for home in response_json['payload']['homes']]
|
parsed_home = self._parse_home(
|
||||||
|
response_json["payload"]["addressSectionInfo"], single_search=True
|
||||||
|
)
|
||||||
|
return [parsed_home]
|
||||||
|
|
||||||
|
def search(self):
|
||||||
|
region_id, region_type = self._handle_location()
|
||||||
|
|
||||||
|
if region_type == "address":
|
||||||
|
home_id = region_id
|
||||||
|
return self.handle_address(home_id)
|
||||||
|
|
||||||
|
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
|
||||||
|
region_id, region_type
|
||||||
|
)
|
||||||
|
|
||||||
|
response = self.session.get(url)
|
||||||
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
|
|
||||||
|
homes = [
|
||||||
|
self._parse_home(home) for home in response_json["payload"]["homes"]
|
||||||
|
] #: support buildings
|
||||||
return homes
|
return homes
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
205
homeharvest/core/scrapers/zillow/__init__.py
Normal file
205
homeharvest/core/scrapers/zillow/__init__.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
import re
|
||||||
|
import json
|
||||||
|
from ..models import Property, Address, Building, ListingType
|
||||||
|
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||||
|
from .. import Scraper
|
||||||
|
|
||||||
|
|
||||||
|
class ZillowScraper(Scraper):
    """Scraper for zillow.com.

    Downloads a Zillow search page for ``self.location``, extracts the JSON
    embedded in the ``__NEXT_DATA__`` script tag and converts it into
    ``Property`` / ``Building`` objects.
    """

    # Annotation only -- intentionally NOT assigned. Attribute lookups fall
    # through to ``Scraper.listing_type``, which the base initialiser is
    # expected to set (verify against Scraper.__init__). Assigning a value
    # here would shadow it and pin every search to that value.
    listing_type: ListingType

    def __init__(self, scraper_input):
        """Build the search URL for the requested listing type.

        Raises:
            NotImplementedError: if the listing type is neither ``FOR_SALE``
                nor ``FOR_RENT``; without a URL ``search`` could only fail
                later with an ``AttributeError``.
        """
        super().__init__(scraper_input)
        if self.listing_type == ListingType.FOR_SALE:
            self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
        elif self.listing_type == ListingType.FOR_RENT:
            self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"
        else:
            raise NotImplementedError(
                f"Zillow scraping is not supported for listing type {self.listing_type!r}."
            )

    def search(self):
        """Fetch the search page and parse the listings it contains.

        Returns:
            A list of ``Property``/``Building`` objects when the page is a
            search-result page, or a one-element list holding a ``Property``
            when the page is a single-property page.

        Raises:
            requests.HTTPError: if Zillow answers with an error status.
            NoResultsFound: if the page has no ``__NEXT_DATA__`` payload.
            PropertyNotFound: if the payload holds neither search results nor
                single-property data.
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            resp.text,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        page_props = json.loads(match.group(1))["props"]["pageProps"]

        if "searchPageState" in page_props:
            houses = page_props["searchPageState"]["cat1"]["searchResults"][
                "listResults"
            ]
            return [self._parse_home(house) for house in houses]

        if "gdpClientCache" in page_props:
            # The cache is itself a JSON-encoded string; the property data
            # is read from its first entry.
            gdp_client_cache = json.loads(page_props["gdpClientCache"])
            main_key = next(iter(gdp_client_cache), None)
            if main_key is not None:
                property_data = gdp_client_cache[main_key]["property"]
                return [self._get_single_property_page(property_data)]

        raise PropertyNotFound("Specific property data not found in the response.")

    @classmethod
    def _parse_home(cls, home: dict):
        """Convert one search-result entry into a ``Property`` or ``Building``.

        This method is used when a user enters a generic location & zillow
        returns more than one property. Entries carrying
        ``hdpData.homeInfo`` become a ``Property``; all others are parsed as
        a multi-unit ``Building``.
        """
        detail_url = home["detailUrl"]
        # Some entries carry an absolute URL already; only prefix relative ones.
        url = (
            f"https://www.zillow.com{detail_url}"
            if "zillow.com" not in detail_url
            else detail_url
        )

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            home_info = home["hdpData"]["homeInfo"]
            price_data = cls._extract_price(home)

            return Property(
                address=cls._extract_address(home),
                agent_name=cls._extract_agent_name(home),
                url=url,
                # .get(): an entry without these keys yields None instead of
                # aborting the whole search with a KeyError.
                beds=home_info.get("bedrooms"),
                baths=home_info.get("bathrooms"),
                # NOTE(review): this stores Zillow's "homeType" in the
                # listing_type field -- confirm that is the intended meaning.
                listing_type=home_info.get("homeType"),
                **price_data,
            )

        keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
        address_one, city, state, zip_code = (home[key] for key in keys)
        address_one, address_two = cls._parse_address_two(address_one)
        address = Address(address_one, city, state, zip_code, address_two=address_two)

        building_info = cls._extract_building_info(home)
        return Building(address=address, url=url, **building_info)

    @classmethod
    def _get_single_property_page(cls, property_data: dict):
        """Convert single-property page data into a ``Property``.

        This method is used when a user enters the exact location & zillow
        returns just one property.
        """
        hdp_url = property_data["hdpUrl"]
        url = (
            f"https://www.zillow.com{hdp_url}"
            if "zillow.com" not in hdp_url
            else hdp_url
        )

        address_data = property_data["address"]
        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )

        # "or {}" also covers keys that are present but null, which a plain
        # .get(key, {}) would pass through as None.
        attribution = property_data.get("attributionInfo") or {}
        reso_facts = property_data.get("resoFacts") or {}

        return Property(
            address=address,
            url=url,
            beds=property_data.get("bedrooms"),
            baths=property_data.get("bathrooms"),
            year_built=property_data.get("yearBuilt"),
            price=property_data.get("price"),
            lot_size=property_data.get("lotSize"),
            agent_name=attribution.get("agentName"),
            stories=reso_facts.get("stories"),
            description=property_data.get("description"),
            mls_id=attribution.get("mlsId"),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot"),
            square_feet=property_data.get("livingArea"),
            listing_type=property_data.get("homeType"),
        )

    @classmethod
    def _extract_building_info(cls, home: dict) -> dict:
        """Summarise the unit prices of a multi-unit building entry.

        Returns:
            Keyword arguments for ``Building``: ``listing_type``,
            ``num_units`` and the min/max/average unit price. The price
            fields are ``None`` when the building lists no units.
        """
        # Prices look like "$1,234+"; strip the decoration and parse once.
        prices = [
            int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
            for unit in home["units"]
        ]
        return {
            "listing_type": cls.listing_type,
            "num_units": len(prices),
            # default=None: min()/max() raise ValueError on an empty sequence.
            "min_unit_price": min(prices, default=None),
            "max_unit_price": max(prices, default=None),
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Return price, area and price-per-area keyword arguments for ``Property``."""
        home_info = home["hdpData"]["homeInfo"]
        price = int(home_info["priceForHDP"])
        square_feet = home_info.get("livingArea")
        lot_size = home_info.get("lotAreaValue")
        # Left as None when either operand is missing or zero.
        price_per_square_foot = price // square_feet if square_feet and price else None

        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": lot_size,
            "price_per_square_foot": price_per_square_foot,
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Return the text after ``"Listing by: "`` in ``brokerName``, if any."""
        # "or ''" also handles a brokerName that is present but null.
        broker_str = home.get("brokerName") or ""
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str) -> tuple[str, str | None]:
        """Split a trailing unit designator off a street address.

        Recognises a trailing ``APT ...`` or ``#...`` part (case-insensitive).

        Returns:
            ``(street, unit)`` where ``unit`` is ``None`` if no designator
            was found, in which case ``street`` is the input unchanged.
        """
        # \b and the negative lookahead keep "apt" from matching inside a
        # word (e.g. "Baptist Rd"), which would cut the street name in half.
        apt_match = re.search(
            r"(\bAPT(?![A-Za-z])\s*.+|#[\s\S]+)$", address_one, re.I
        )
        if not apt_match:
            return address_one, None

        address_two = apt_match.group().strip()
        return address_one[: apt_match.start()].strip(), address_two

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an ``Address`` from the ``hdpData.homeInfo`` of a result entry."""
        home_info = home["hdpData"]["homeInfo"]
        keys = ("streetAddress", "city", "state", "zipcode")
        address_one, city, state, zip_code = (home_info[key] for key in keys)
        address_one, address_two = ZillowScraper._parse_address_two(address_one)
        return Address(address_one, city, state, zip_code, address_two=address_two)

    @staticmethod
    def _get_headers():
        """Return the browser-like request headers sent with the search request."""
        return {
            # NOTE(review): this "authority" value is a geo-suggest host, not
            # zillow.com -- confirm it is intended before changing it.
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
|
||||||
@@ -1,8 +1,14 @@
|
|||||||
class InvalidSite(Exception):
|
class InvalidSite(Exception):
|
||||||
"""Raised when a provided site is does not exist."""
|
"""Raised when a provided site is does not exist."""
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidListingType(Exception):
|
class InvalidListingType(Exception):
|
||||||
"""Raised when a provided listing type is does not exist."""
|
"""Raised when a provided listing type is does not exist."""
|
||||||
pass
|
|
||||||
|
|
||||||
|
class NoResultsFound(Exception):
|
||||||
|
"""Raised when no results are found for the given location"""
|
||||||
|
|
||||||
|
|
||||||
|
class PropertyNotFound(Exception):
|
||||||
|
"""Raised when no property is found for the given address"""
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.1.1"
|
version = "0.1.2"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
|
|||||||
9
tests/test_realtor.py
Normal file
9
tests/test_realtor.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from homeharvest import scrape_property
|
||||||
|
|
||||||
|
|
||||||
|
def test_realtor():
    """Smoke-test the realtor.com scraper with a ZIP-code search (hits the network)."""
    locations = ("85281",)
    results = [
        scrape_property(location=place, site_name="realtor.com")
        for place in locations
    ]

    assert all(outcome is not None for outcome in results)
|
||||||
@@ -2,8 +2,11 @@ from homeharvest import scrape_property
|
|||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
result = scrape_property(
|
results = [
|
||||||
location="85281"
|
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
|
||||||
)
|
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
|
||||||
|
scrape_property(location="Dallas, TX, USA", site_name="redfin"),
|
||||||
|
scrape_property(location="85281", site_name="redfin"),
|
||||||
|
]
|
||||||
|
|
||||||
assert result is not None
|
assert all([result is not None for result in results])
|
||||||
|
|||||||
12
tests/test_zillow.py
Normal file
12
tests/test_zillow.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from homeharvest import scrape_property
|
||||||
|
|
||||||
|
|
||||||
|
def test_zillow():
    """Smoke-test the Zillow scraper with a street address, two cities and a ZIP code (hits the network)."""
    locations = (
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    )
    results = [
        scrape_property(location=place, site_name="zillow") for place in locations
    ]

    assert all(outcome is not None for outcome in results)
|
||||||
Reference in New Issue
Block a user