Compare commits

...

10 Commits

Author SHA1 Message Date
Cullen Watson
a433e46258 chore: update version number 2023-09-17 15:12:39 -05:00
Cullen Watson
df3519ae18 docs: add example 2023-09-17 15:10:21 -05:00
Cullen Watson
2f5ea1ca88 feat(scrapers): add zillow 2023-09-17 15:06:31 -05:00
Zachary Hampton
2f3b012747 - single address support 2023-09-16 14:34:10 -07:00
Zachary Hampton
5ea0fa0bdb - redfin city support
- test case updates
- types addition
- docs grammar
2023-09-16 13:39:03 -07:00
Zachary Hampton
2d6e746ae9 Create LICENSE 2023-09-16 10:39:36 -07:00
Zachary Hampton
a772fe45aa - rename to property 2023-09-16 10:11:39 -07:00
Zachary Hampton
4764b6bd37 Merge remote-tracking branch 'origin/master' 2023-09-15 20:59:03 -07:00
Zachary Hampton
0946abd35a - realtor init 2023-09-15 20:58:54 -07:00
Cullen Watson
0a2fb4cb31 docs: add roadmap 2023-09-15 21:47:46 -05:00
14 changed files with 486 additions and 59 deletions

5
.gitignore vendored
View File

@@ -1,2 +1,5 @@
/.idea
dist
**/dist/
**/__pycache__/
**/.pytest_cache/
*.pyc

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Zachary Hampton
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1 +1,35 @@
# HomeHarvest
**HomeHarvest** aims to be the top Python real estate scraping library.
## Roadmap
- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `Redfin`.
- **Output**: Provides the option to return the scraped data as a Pandas dataframe.
- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience.
## Site Name Options
- `zillow`
- `redfin`
## Listing Types
- `for_rent`
- `for_sale`
### Installation
```bash
pip install --upgrade homeharvest
```
### Example Usage
```
from homeharvest import scrape_property
properties = scrape_property(
location="85281", site_name="zillow", listing_type="for_rent"
)
print(properties)
```

View File

@@ -1,24 +1,31 @@
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.types import ListingType, Home
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union
_scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper,
"zillow": ZillowScraper,
}
def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
site_name: str = "redfin",
) -> list[Home]: #: eventually, return pandas dataframe
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)
scraper_input = ScraperInput(
location=location,

View File

@@ -1,6 +1,6 @@
from dataclasses import dataclass
import requests
from .types import Home, ListingType
from .models import Property, ListingType
@dataclass
@@ -11,9 +11,12 @@ class ScraperInput:
class Scraper:
listing_type = ListingType.FOR_SALE
def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.session = requests.Session()
Scraper.listing_type = scraper_input.listing_type
if scraper_input.proxy_url:
self.session.proxies = {
@@ -21,7 +24,12 @@ class Scraper:
"https": scraper_input.proxy_url,
}
def search(self) -> list[Home]: ...
def search(self) -> list[Property]:
...
@staticmethod
def parse_home(home) -> Home: ...
def _parse_home(home) -> Property:
...
def handle_location(self):
...

View File

@@ -19,17 +19,34 @@ class Address:
@dataclass
class Home:
class Property:
address: Address
url: str
beds: int | None = None
baths: int | None = None
baths: float | None = None
stories: int | None = None
agent_name: str | None = None
description: str | None = None
year_built: int | None = None
square_feet: int | None = None
price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None
mls_id: str | None = None
listing_type: ListingType | None = None
lot_size: int | None = None
description: str | None = None
@dataclass
class Building:
    """A multi-unit listing summarised by its unit count and unit price range."""

    # Location of the building.
    address: Address
    # Link to the listing page.
    url: str
    # Number of units reported for the building.
    num_units: int | None = None
    # Lowest, highest and average price across the building's units.
    min_unit_price: int | None = None
    max_unit_price: int | None = None
    avg_unit_price: int | None = None
    # NOTE(review): annotated as ``str``, but ZillowScraper._extract_building_info
    # fills this with the scraper's ``listing_type`` attribute - confirm the intended type.
    listing_type: str | None = None

View File

@@ -0,0 +1,51 @@
import json
from ..models import Property, Address
from .. import Scraper
from typing import Any
class RealtorScraper(Scraper):
    """Scraper for realtor.com.

    Work in progress: the location lookup is implemented, but ``search`` does
    not fetch or return any listings yet.
    """

    def __init__(self, scraper_input):
        super().__init__(scraper_input)

    def handle_location(self):
        """Resolve ``self.location`` through the realtor.com suggest endpoint.

        Returns:
            The first entry of the ``autocomplete`` list in the JSON response.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
            NoResultsFound: If the response carries no suggestion at all.
        """
        # Function-scope import so the module's top-level imports stay untouched.
        from ....exceptions import NoResultsFound

        headers = {
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.realtor.com",
            "referer": "https://www.realtor.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
        params = {
            "input": self.location,
            "client_id": "for-sale",
            "limit": "1",
            "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
        }

        response = self.session.get(
            "https://parser-external.geo.moveaws.com/suggest",
            params=params,
            headers=headers,
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()

        suggestions = response.json().get("autocomplete")
        if not suggestions:
            raise NoResultsFound(
                f"No results were found for realtor.com with the given location: '{self.location}'."
            )
        return suggestions[0]

    def search(self):
        """Look up the location; retrieving listings is not implemented yet.

        Returns:
            None.
            NOTE(review): the base ``Scraper.search`` is annotated as returning
            a list of properties - this method still has to be completed.
        """
        location_info = self.handle_location()
        # Not used yet; kept because the pending implementation must branch on it.
        location_type = location_info["area_type"]  # noqa: F841

        # property types:
        # apartment + building + commercial + condo_townhome +
        # condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm +
        # investment + land + mobile + multi_family + rental + single_family +
        # townhomes
        # TODO: query the listings for ``location_info`` and return them.

View File

@@ -1,5 +1,5 @@
import json
from ..types import Home, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any
@@ -8,56 +8,107 @@ class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)
def handle_location(self):
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location)
def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
self.location
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
if response_json['payload']['exactMatch'] is not None:
return response_json['payload']['exactMatch']['id'].split('_')[1]
def get_region_type(match_type: str):
if match_type == "4":
return "2" #: zip
elif match_type == "2":
return "6" #: city
elif match_type == "1":
return "address" #: address, needs to be handled differently
if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"]
else:
return response_json['payload']['sections'][0]['rows'][0].split('_')[1]
target = response_json["payload"]["sections"][0]["rows"][0]
return target["id"].split("_")[1], get_region_type(target["type"])
@staticmethod
def parse_home(home: dict) -> Home:
def _parse_home(home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None:
if key in home and "value" in home[key]:
return home[key]["value"]
if not single_search:
address = Address(
address_one=home['streetLine']['value'],
city=home['city'],
state=home['state'],
zip_code=home['zip']
address_one=get_value("streetLine"),
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)
else:
address_info = home["streetAddress"]
address = Address(
address_one=address_info["assembledAddress"],
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)
url = 'https://www.redfin.com{}'.format(home['url'])
url = "https://www.redfin.com{}".format(home["url"])
def get_value(key: str) -> Any | None:
if key in home and 'value' in home[key]:
return home[key]['value']
return Home(
return Property(
address=address,
url=url,
beds=home['beds'] if 'beds' in home else None,
baths=home['baths'] if 'baths' in home else None,
stories=home['stories'] if 'stories' in home else None,
agent_name=get_value('listingAgent'),
description=home['listingRemarks'] if 'listingRemarks' in home else None,
year_built=get_value('yearBuilt'),
square_feet=get_value('sqFt'),
price_per_square_foot=get_value('pricePerSqFt'),
price=get_value('price'),
mls_id=get_value('mlsId')
beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None,
agent_name=get_value("listingAgent"),
description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt")
if not single_search
else home["yearBuilt"],
square_feet=get_value("sqFt"),
price_per_square_foot=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"),
)
def search(self):
region_id = self.handle_location()
def handle_address(self, home_id: str):
"""
EPs:
https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
"""
url = 'https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type=2'.format(region_id)
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id
)
response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))
homes = [self.parse_home(home) for home in response_json['payload']['homes']]
parsed_home = self._parse_home(
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home]
def search(self):
region_id, region_type = self._handle_location()
if region_type == "address":
home_id = region_id
return self.handle_address(home_id)
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
region_id, region_type
)
response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", ""))
homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]
] #: support buildings
return homes

View File

@@ -0,0 +1,205 @@
import re
import json
from ..models import Property, Address, Building, ListingType
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper
class ZillowScraper(Scraper):
    """Scraper that reads listings out of the JSON embedded in Zillow pages.

    ``search`` downloads the results page for ``self.location`` and parses the
    ``__NEXT_DATA__`` script payload, which holds either a list of search
    results or the data of one single property.
    """

    # Annotation only, on purpose: the value is provided by ``Scraper`` (class
    # default, rebound in ``Scraper.__init__``). Assigning a value here would
    # shadow it and pin every Zillow search to a single listing type.
    listing_type: ListingType

    # URL path segment of the search page for each supported listing type.
    _SEARCH_PATHS = {
        ListingType.FOR_SALE: "for_sale",
        ListingType.FOR_RENT: "for_rent",
    }

    # Captures the JSON document Zillow embeds in its pages.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    # Trailing unit designator: "APT ..." as a whole word, or "#...".
    # The look-ahead keeps words such as "Aptos" from matching and the word
    # boundary keeps "Chapter" from matching.
    _UNIT_RE = re.compile(r"(\bAPT(?![a-z])\s*.+|#[\s\S]+)$", re.I)

    def __init__(self, scraper_input):
        """Prepare the search URL for the requested location and listing type.

        Raises:
            NotImplementedError: If the listing type has no Zillow search URL
                mapped yet (previously this surfaced later as an
                ``AttributeError`` on ``self.url`` inside ``search``).
        """
        super().__init__(scraper_input)

        path = self._SEARCH_PATHS.get(self.listing_type)
        if path is None:
            raise NotImplementedError(
                f"Zillow scraping is not supported for listing type '{self.listing_type}'."
            )
        self.url = f"https://www.zillow.com/homes/{path}/{self.location}_rb/"

    def search(self):
        """Fetch the Zillow page for the location and parse its listings.

        Returns:
            A list of ``Property``/``Building`` objects for a multi-result
            page, or a one-element list for a single property page.

        Raises:
            requests.HTTPError: If Zillow answers with an error status.
            NoResultsFound: If the page carries no ``__NEXT_DATA__`` payload.
            PropertyNotFound: If the payload holds neither search results nor
                single-property data.
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()

        match = self._NEXT_DATA_RE.search(resp.text)
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        page_props = json.loads(match.group(1))["props"]["pageProps"]

        if "searchPageState" in page_props:
            houses = page_props["searchPageState"]["cat1"]["searchResults"][
                "listResults"
            ]
            return [self._parse_home(house) for house in houses]

        if "gdpClientCache" in page_props:
            # The cache is itself a JSON string keyed by an opaque query id;
            # only its first entry is used.
            gdp_client_cache = json.loads(page_props["gdpClientCache"])
            if gdp_client_cache:
                first_entry = next(iter(gdp_client_cache.values()))
                return [self._get_single_property_page(first_entry["property"])]

        raise PropertyNotFound("Specific property data not found in the response.")

    @staticmethod
    def _absolute_url(path_or_url: str) -> str:
        """Prefix the Zillow host unless the value already contains it."""
        if "zillow.com" in path_or_url:
            return path_or_url
        return f"https://www.zillow.com{path_or_url}"

    @classmethod
    def _parse_home(cls, home: dict):
        """
        This method is used when a user enters a generic location & zillow returns more than one property

        Returns a ``Property`` when the result carries ``hdpData.homeInfo``,
        otherwise a ``Building`` summarising the result's units.
        """
        url = cls._absolute_url(home["detailUrl"])

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            home_info = home["hdpData"]["homeInfo"]
            return Property(
                address=cls._extract_address(home),
                agent_name=cls._extract_agent_name(home),
                url=url,
                # Not every result reports rooms, so these may be absent.
                beds=home_info.get("bedrooms"),
                baths=home_info.get("bathrooms"),
                listing_type=home_info.get("homeType"),
                **cls._extract_price(home),
            )

        street, address_two = cls._parse_address_two(home["addressStreet"])
        address = Address(
            address_one=street,
            city=home["addressCity"],
            state=home["addressState"],
            zip_code=home["addressZipcode"],
            address_two=address_two,
        )
        return Building(address=address, url=url, **cls._extract_building_info(home))

    @classmethod
    def _get_single_property_page(cls, property_data: dict):
        """
        This method is used when a user enters the exact location & zillow returns just one property
        """
        url = cls._absolute_url(property_data["hdpUrl"])

        address_data = property_data["address"]
        address_one, address_two = cls._parse_address_two(address_data["streetAddress"])
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )

        # ``or {}`` also covers keys that are present but null in the payload.
        attribution = property_data.get("attributionInfo") or {}
        reso_facts = property_data.get("resoFacts") or {}

        return Property(
            address=address,
            url=url,
            beds=property_data.get("bedrooms"),
            baths=property_data.get("bathrooms"),
            year_built=property_data.get("yearBuilt"),
            price=property_data.get("price"),
            lot_size=property_data.get("lotSize"),
            agent_name=attribution.get("agentName"),
            stories=reso_facts.get("stories"),
            description=property_data.get("description"),
            mls_id=attribution.get("mlsId"),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot"),
            square_feet=property_data.get("livingArea"),
            listing_type=property_data.get("homeType"),
        )

    @staticmethod
    def _parse_unit_price(price_text: str) -> int:
        """Turn a unit price such as ``"$1,234+"`` into ``1234``."""
        return int(price_text.replace("$", "").replace(",", "").split("+")[0])

    @classmethod
    def _extract_building_info(cls, home: dict) -> dict:
        """Summarise the units of a building result.

        Returns:
            Keyword arguments for ``Building``: the listing type, the number
            of units and their minimum, maximum and (floored) average price.
            The price figures are ``None`` when the building lists no units.
        """
        # Parse every unit price exactly once.
        prices = [cls._parse_unit_price(unit["price"]) for unit in home["units"]]
        return {
            "listing_type": cls.listing_type,
            "num_units": len(prices),
            # ``default`` avoids a ValueError on an empty unit list.
            "min_unit_price": min(prices, default=None),
            "max_unit_price": max(prices, default=None),
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Collect price and size figures from ``hdpData.homeInfo``.

        Returns:
            A dict with ``price``, ``square_feet``, ``lot_size`` and
            ``price_per_square_foot``; values that cannot be determined are
            ``None``.
        """
        home_info = home["hdpData"]["homeInfo"]

        raw_price = home_info.get("priceForHDP")
        price = int(raw_price) if raw_price is not None else None
        square_feet = home_info.get("livingArea")
        lot_size = home_info.get("lotAreaValue")

        # Only computable when both figures are present and non-zero.
        price_per_square_foot = (
            int(price // square_feet) if price and square_feet else None
        )

        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": lot_size,
            "price_per_square_foot": price_per_square_foot,
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Return the text after ``"Listing by: "`` in ``brokerName``, if any."""
        # ``or ""`` guards against a key that is present but null.
        broker_str = home.get("brokerName") or ""
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str):
        """Split a trailing unit designator off a street address.

        Returns:
            ``(street, unit)`` where ``unit`` is the trailing ``APT ...`` or
            ``#...`` part, or ``None`` when the address has none. For
            example ``"12 Main St Apt 4"`` gives ``("12 Main St", "Apt 4")``.
        """
        apt_match = ZillowScraper._UNIT_RE.search(address_one)
        if apt_match is None:
            return address_one, None

        address_two = apt_match.group().strip()
        # Cut at the match instead of str.replace, which would also remove
        # any earlier occurrence of the same text.
        return address_one[: apt_match.start()].strip(), address_two

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an ``Address`` from the ``hdpData.homeInfo`` fields."""
        home_info = home["hdpData"]["homeInfo"]
        address_one, address_two = ZillowScraper._parse_address_two(
            home_info["streetAddress"]
        )
        return Address(
            address_one=address_one,
            city=home_info["city"],
            state=home_info["state"],
            zip_code=home_info["zipcode"],
            address_two=address_two,
        )

    @staticmethod
    def _get_headers():
        """Return the browser-like request headers sent with every page fetch."""
        return {
            # Was the realtor.com geo host, a copy-paste leftover.
            "authority": "www.zillow.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }

View File

@@ -1,8 +1,14 @@
class InvalidSite(Exception):
    """Raised when a provided site does not exist."""
    pass
class InvalidListingType(Exception):
    """Raised when a provided listing type does not exist."""
    pass
class NoResultsFound(Exception):
    """Raised when no results are found for the given location.

    ZillowScraper.search raises it when the fetched page contains no
    ``__NEXT_DATA__`` script payload.
    """
class PropertyNotFound(Exception):
    """Raised when no property is found for the given address.

    ZillowScraper.search raises it when the page payload holds neither
    ``searchPageState`` nor ``gdpClientCache``.
    """

View File

@@ -1,8 +1,8 @@
[tool.poetry]
name = "homeharvest"
version = "0.1.1"
version = "0.1.2"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>"]
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
readme = "README.md"

9
tests/test_realtor.py Normal file
View File

@@ -0,0 +1,9 @@
from homeharvest import scrape_property
def test_realtor():
    """A ZIP code lookup on realtor.com must return something other than None."""
    zip_codes = ("85281",)
    scraped = [
        scrape_property(location=zip_code, site_name="realtor.com")
        for zip_code in zip_codes
    ]

    assert all(outcome is not None for outcome in scraped)

View File

@@ -2,8 +2,11 @@ from homeharvest import scrape_property
def test_redfin():
result = scrape_property(
location="85281"
)
results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
scrape_property(location="Dallas, TX, USA", site_name="redfin"),
scrape_property(location="85281", site_name="redfin"),
]
assert result is not None
assert all([result is not None for result in results])

12
tests/test_zillow.py Normal file
View File

@@ -0,0 +1,12 @@
from homeharvest import scrape_property
def test_zillow():
    """Street address, city and ZIP code lookups on Zillow must each return a result."""
    locations = (
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    )
    # Run every lookup first, then check them together.
    scraped = [
        scrape_property(location=location, site_name="zillow")
        for location in locations
    ]

    assert all(outcome is not None for outcome in scraped)
assert all([result is not None for result in results])