refactor(redfin)

pull/1/head
Cullen Watson 2023-09-18 14:36:18 -05:00
commit ffd3ce6aed
9 changed files with 391 additions and 126 deletions

View File

@ -2,34 +2,41 @@
**HomeHarvest** aims to be the top Python real estate scraping library. **HomeHarvest** aims to be the top Python real estate scraping library.
## RoadMap _**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._
- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`. [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
- **Output**: Provides the option to return the scraped data as a Pandas dataframe.
- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience.
## Site Name Options ## Installation
- `zillow`
- `redfin`
## Listing Types
- `for_rent`
- `for_sale`
### Installation
```bash ```bash
pip install --upgrade homeharvest pip install --upgrade homeharvest
``` ```
### Example Usage ## Example Usage
``` ```py
from homeharvest import scrape_property >>> from homeharvest import scrape_property
... properties = scrape_property(
... location="85281", site_name="zillow", listing_type="for_rent"
... )
properties = scrape_property( >>> properties.head()
location="85281", site_name="zillow", listing_type="for_rent" address_one city ... mls_id description
) 0 420 N Scottsdale Rd Tempe ... NaN NaN
print(properties) 1 1255 E University Dr Tempe ... NaN NaN
2 1979 E Rio Salado Pkwy Tempe ... NaN NaN
3 548 S Wilson St Tempe ... None None
4 945 E Playa Del Norte Dr Unit 4027 Tempe ... NaN NaN
[5 rows x 23 columns]
``` ```
### Site Name Options
- `zillow`
- `redfin`
- `realtor.com`
### Listing Types
- `for_rent`
- `for_sale`
- `sold`

View File

@ -92,7 +92,17 @@ def scrape_property(
location: str, location: str,
site_name: str, site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> list[Property]: ) -> pd.DataFrame:
"""
Scrape property from various sites from a given location and listing type.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
"""
validate_input(site_name, listing_type) validate_input(site_name, listing_type)
scraper_input = ScraperInput( scraper_input = ScraperInput(

View File

@ -14,6 +14,8 @@ class ScraperInput:
class Scraper: class Scraper:
def __init__(self, scraper_input: ScraperInput): def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type
self.session = requests.Session() self.session = requests.Session()
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name

View File

@ -57,6 +57,7 @@ class Address:
country: str | None = None country: str | None = None
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str

View File

@ -1,12 +1,15 @@
import json import json
from ..models import Property, Address from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any from typing import Any, Generator
from ....exceptions import NoResultsFound
from concurrent.futures import ThreadPoolExecutor, as_completed
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
def handle_location(self): def handle_location(self):
headers = { headers = {
@ -26,7 +29,7 @@ class RealtorScraper(Scraper):
params = { params = {
"input": self.location, "input": self.location,
"client_id": "for-sale", "client_id": self.listing_type.value.replace('_', '-'),
"limit": "1", "limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
@ -38,14 +41,228 @@ class RealtorScraper(Scraper):
) )
response_json = response.json() response_json = response.json()
return response_json["autocomplete"][0] result = response_json["autocomplete"]
if result is None:
raise NoResultsFound("No results found for location: " + self.location)
return result[0]
def handle_address(self, property_id: str) -> list[Property]:
    """
    Fetch a single property by id and return it as a one-element list.

    Posts the ``Property`` GraphQL query to ``self.search_url`` using
    ``self.session`` and maps the returned record onto a ``Property``.

    :param property_id: value sent as the ``$property_id`` query variable;
        it is also stored as the resulting property's ``mls_id``
    :return: list containing exactly one Property
    :raises NoResultsFound: if the response contains no property record
    """
    query = """query Property($property_id: ID!) {
property(id: $property_id) {
property_id
details {
date_updated
garage
permalink
year_built
stories
}
address {
address_validation_code
city
country
county
line
postal_code
state_code
street_direction
street_name
street_number
street_suffix
street_post_direction
unit_value
unit
unit_descriptor
zip
}
basic {
baths
beds
price
sqft
lot_sqft
type
sold_price
}
public_record {
lot_size
sqft
stories
units
year_built
}
}
}"""

    payload = {
        'query': query,
        'variables': {'property_id': property_id},
    }

    response = self.session.post(self.search_url, json=payload)
    response_json = response.json()

    property_info = response_json['data']['property']
    if property_info is None:
        # Fail with the same exception handle_location uses for an empty
        # lookup, rather than a TypeError from subscripting None below.
        raise NoResultsFound(f"No results found for property id: {property_id}")

    address = property_info['address']
    details = property_info['details']
    basic = property_info['basic']
    public_record = property_info['public_record']

    price = basic['price']
    sqft = basic['sqft']
    # Truthiness test on sqft (not just "is not None"): a reported area of 0
    # must produce None instead of raising ZeroDivisionError.
    price_per_sqft = price / sqft if price is not None and sqft else None

    # NOTE(review): the Redfin and Zillow parsers in this change build
    # Property with `property_url=` and Address with `street_address=`;
    # confirm that `url=` / `address_one=` match the model's field names.
    return [Property(
        site_name=self.site_name,
        address=Address(
            address_one=address['line'],
            city=address['city'],
            state=address['state_code'],
            zip_code=address['postal_code'],
        ),
        url="https://www.realtor.com/realestateandhomes-detail/" + details['permalink'],
        beds=basic['beds'],
        baths=basic['baths'],
        stories=details['stories'],
        year_built=details['year_built'],
        square_feet=sqft,
        price_per_square_foot=price_per_sqft,
        price=price,
        mls_id=property_id,
        listing_type=self.listing_type,
        # public_record may be null in the response; lot size is then unknown.
        lot_size=public_record['lot_size'] if public_record is not None else None,
    )]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
    """
    Run the ``Home_search`` GraphQL query for an area.

    The query's ``status`` filter is filled in from ``self.listing_type.value``
    and each request asks for up to 200 results starting at the ``offset``
    supplied in ``variables``.

    :param variables: GraphQL variables (city, county, state_code,
        postal_code, offset) sent along with the query
    :param return_total: when True, return only the ``total`` field of the
        search response instead of the parsed results
    :return: the ``total`` count if ``return_total`` is set, otherwise a
        list of Property objects built from the returned results
    """
    query = """query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int,
) {
home_search(
query: {
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s
}
limit: 200
offset: $offset
) {
count
total
results {
property_id
description {
baths
beds
lot_sqft
sqft
text
sold_price
stories
year_built
garage
unit_number
floor_number
}
location {
address {
city
country
line
postal_code
state_code
state
street_direction
street_name
street_number
street_post_direction
street_suffix
unit
}
}
list_price
price_per_sqft
source {
id
}
}
}
}""" % self.listing_type.value

    response = self.session.post(
        self.search_url,
        json={'query': query, 'variables': variables},
    )
    home_search = response.json()['data']['home_search']

    if return_total:
        return home_search['total']

    def to_property(entry: dict):
        # Map one search result onto the Property model.
        location = entry['location']['address']
        description = entry['description']
        return Property(
            address=Address(
                address_one=location['line'],
                city=location['city'],
                state=location['state_code'],
                zip_code=location['postal_code'],
                address_two=location['unit'],
            ),
            site_name=self.site_name,
            url="https://www.realtor.com/realestateandhomes-detail/" + entry['property_id'],
            beds=description['beds'],
            baths=description['baths'],
            stories=description['stories'],
            year_built=description['year_built'],
            square_feet=description['sqft'],
            price_per_square_foot=entry['price_per_sqft'],
            price=entry['list_price'],
            mls_id=entry['property_id'],
            listing_type=self.listing_type,
            lot_size=description['lot_sqft'],
        )

    return [to_property(entry) for entry in home_search['results']]
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
""" if location_type == 'address':
property types: property_id = location_info['mpr_id']
apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes return self.handle_address(property_id)
"""
print("a") offset = 0
search_variables = {
'city': location_info.get('city'),
'county': location_info.get('county'),
'state_code': location_info.get('state_code'),
'postal_code': location_info.get('postal_code'),
'offset': offset,
}
total = self.handle_area(search_variables, return_total=True)
homes = []
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(
self.handle_area, variables=search_variables | {'offset': i}, return_total=False
) for i in range(0, total, 200)
]
for future in as_completed(futures):
homes.extend(future.result())
return homes

View File

@ -93,6 +93,35 @@ class RedfinScraper(Scraper):
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
) )
def _parse_building(self, building: dict) -> Property:
    """
    Convert one building entry of a Redfin response into a Property.

    :param building: building dict; reads its ``address`` sub-dict plus the
        ``url`` and ``numUnitsForSale`` keys
    :return: Property with property_type BUILDING
    """
    address = building['address']

    def join_parts(*keys: str) -> str:
        # Join only the parts that are present and non-empty. A plain
        # " ".join over every key raises TypeError on a None part and
        # leaves double/stray spaces for empty ones.
        # NOTE(review): presumably not every building carries every part
        # (e.g. directionalPrefix) — confirm against a real payload.
        return " ".join(str(address[key]) for key in keys if address.get(key))

    street_address = join_parts(
        'streetNumber',
        'directionalPrefix',
        'streetName',
        'streetType',
    )
    # No unit information -> None, as the Zillow parser does when it passes
    # home_info.get("unit") for the same Address field.
    unit = join_parts('unitType', 'unitValue') or None

    return Property(
        site_name=self.site_name,
        property_type=PropertyType("BUILDING"),
        address=Address(
            street_address=street_address,
            city=address['city'],
            state=address['stateOrProvinceCode'],
            zip_code=address['postalCode'],
            unit=unit,
        ),
        property_url="https://www.redfin.com{}".format(building["url"]),
        listing_type=self.listing_type,
        bldg_unit_count=building["numUnitsForSale"],
    )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
""" """
EPs: EPs:
@ -130,5 +159,8 @@ class RedfinScraper(Scraper):
homes = [ homes = [
self._parse_home(home) for home in response_json["payload"]["homes"] self._parse_home(home) for home in response_json["payload"]["homes"]
] #: support buildings ] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
]
return homes return homes

View File

@ -117,11 +117,10 @@ class ZillowScraper(Scraper):
"isDebugRequest": False, "isDebugRequest": False,
} }
) )
print(payload)
resp = self.session.put(url, headers=self._get_headers(), data=payload) resp = self.session.put(url, headers=self._get_headers(), data=payload)
resp.raise_for_status() resp.raise_for_status()
a = resp.json() a = resp.json()
return parse_properties(resp.json()) return self._parse_properties(resp.json())
def _parse_properties(self, property_data: dict): def _parse_properties(self, property_data: dict):
mapresults = property_data["cat1"]["searchResults"]["mapResults"] mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@ -129,98 +128,92 @@ class ZillowScraper(Scraper):
properties_list = [] properties_list = []
for result in mapresults: for result in mapresults:
try: if "hdpData" in result:
if "hdpData" in result: home_info = result["hdpData"]["homeInfo"]
home_info = result["hdpData"]["homeInfo"] address_data = {
address_data = { "street_address": home_info["streetAddress"],
"street_address": home_info["streetAddress"], "unit": home_info.get("unit"),
"unit": home_info.get("unit"), "city": home_info["city"],
"city": home_info["city"], "state": home_info["state"],
"state": home_info["state"], "zip_code": home_info["zipcode"],
"zip_code": home_info["zipcode"], "country": home_info["country"],
"country": home_info["country"], }
} property_data = {
property_data = { "site_name": self.site_name,
"site_name": self.site_name, "address": Address(**address_data),
"address": Address(**address_data), "property_url": f"https://www.zillow.com{result['detailUrl']}",
"property_url": f"https://www.zillow.com{result['detailUrl']}", "beds": int(home_info["bedrooms"])
"beds": int(home_info["bedrooms"]) if "bedrooms" in home_info
if "bedrooms" in home_info else None,
else None, "baths": home_info.get("bathrooms"),
"baths": home_info.get("bathrooms"), "square_feet": int(home_info["livingArea"])
"square_feet": int(home_info["livingArea"]) if "livingArea" in home_info
if "livingArea" in home_info else None,
else None, "currency": home_info["currency"],
"currency": home_info["currency"], "price": home_info.get("price"),
"price": home_info.get("price"), "square_feet": int(home_info["livingArea"])
"square_feet": int(home_info["livingArea"]) if "livingArea" in home_info
if "livingArea" in home_info else None,
else None, "tax_assessed_value": int(home_info["taxAssessedValue"])
"tax_assessed_value": int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info
if "taxAssessedValue" in home_info else None,
else None, "property_type": PropertyType(home_info["homeType"]),
"property_type": PropertyType(home_info["homeType"]), "listing_type": ListingType(
"listing_type": ListingType( home_info["statusType"]
home_info["statusType"] if "statusType" in home_info
if "statusType" in home_info else self.listing_type
else self.listing_type ),
), "lot_area_value": round(home_info["lotAreaValue"], 2)
"lot_area_value": round(home_info["lotAreaValue"], 2) if "lotAreaValue" in home_info
if "lotAreaValue" in home_info else None,
else None, "lot_area_unit": home_info.get("lotAreaUnit"),
"lot_area_unit": home_info.get("lotAreaUnit"), "latitude": result["latLong"]["latitude"],
"latitude": result["latLong"]["latitude"], "longitude": result["latLong"]["longitude"],
"longitude": result["latLong"]["longitude"], "status_text": result.get("statusText"),
"status_text": result.get("statusText"), "posted_time": result["variableData"]["text"]
"posted_time": result["variableData"]["text"] if "variableData" in result
if "variableData" in result and "text" in result["variableData"]
and "text" in result["variableData"] and result["variableData"]["type"] == "TIME_ON_INFO"
and result["variableData"]["type"] == "TIME_ON_INFO" else None,
else None, "img_src": result.get("imgSrc"),
"img_src": result.get("imgSrc"), "price_per_sqft": int(
"price_per_sqft": int( home_info["price"] // home_info["livingArea"]
home_info["price"] // home_info["livingArea"] )
) if "livingArea" in home_info and "price" in home_info
if "livingArea" in home_info and "price" in home_info else None,
else None, }
} property_obj = Property(**property_data)
property_obj = Property(**property_data) properties_list.append(property_obj)
properties_list.append(property_obj)
elif "isBuilding" in result: elif "isBuilding" in result:
price = result["price"] price = result["price"]
building_data = { building_data = {
"property_url": f"https://www.zillow.com{result['detailUrl']}", "property_url": f"https://www.zillow.com{result['detailUrl']}",
"site_name": self.site_name, "site_name": self.site_name,
"property_type": PropertyType("BUILDING"), "property_type": PropertyType("BUILDING"),
"listing_type": ListingType(result["statusType"]), "listing_type": ListingType(result["statusType"]),
"img_src": result["imgSrc"], "img_src": result["imgSrc"],
"price": int(price.replace("From $", "").replace(",", "")) "price": int(price.replace("From $", "").replace(",", ""))
if "From $" in price if "From $" in price
else None, else None,
"apt_min_price": int( "apt_min_price": int(
price.replace("$", "").replace(",", "").replace("+/mo", "") price.replace("$", "").replace(",", "").replace("+/mo", "")
) )
if "+/mo" in price if "+/mo" in price
else None, else None,
"address": self._extract_address(result["address"]), "address": self._extract_address(result["address"]),
"bldg_min_beds": result["minBeds"], "bldg_min_beds": result["minBeds"],
"currency": "USD", "currency": "USD",
"bldg_min_baths": result["minBaths"], "bldg_min_baths": result["minBaths"],
"bldg_min_area": result.get("minArea"), "bldg_min_area": result.get("minArea"),
"bldg_unit_count": result["unitCount"], "bldg_unit_count": result["unitCount"],
"bldg_name": result.get("communityName"), "bldg_name": result.get("communityName"),
"status_text": result["statusText"], "status_text": result["statusText"],
"latitude": result["latLong"]["latitude"], "latitude": result["latLong"]["latitude"],
"longitude": result["latLong"]["longitude"], "longitude": result["latLong"]["longitude"],
} }
building_obj = Property(**building_data) building_obj = Property(**building_data)
properties_list.append(building_obj) properties_list.append(building_obj)
except Exception as e:
print(home_info)
traceback.print_exc()
sys.exit()
return properties_list return properties_list

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.1.2" version = "0.1.3"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@ -3,6 +3,9 @@ from homeharvest import scrape_property
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format
scrape_property(location="85281", site_name="realtor.com"), scrape_property(location="85281", site_name="realtor.com"),
] ]