refactor(redfin)
commit
ffd3ce6aed
51
README.md
51
README.md
|
@ -2,34 +2,41 @@
|
||||||
|
|
||||||
**HomeHarvest** aims to be the top Python real estate scraping library.
|
**HomeHarvest** aims to be the top Python real estate scraping library.
|
||||||
|
|
||||||
## RoadMap
|
_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._
|
||||||
|
|
||||||
- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`.
|
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||||
- **Output**: Provides the option to return the scraped data as a Pandas dataframe.
|
|
||||||
- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience.
|
|
||||||
|
|
||||||
## Site Name Options
|
## Installation
|
||||||
|
|
||||||
- `zillow`
|
|
||||||
- `redfin`
|
|
||||||
|
|
||||||
## Listing Types
|
|
||||||
|
|
||||||
- `for_rent`
|
|
||||||
- `for_sale`
|
|
||||||
|
|
||||||
### Installation
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install --upgrade homeharvest
|
pip install --upgrade homeharvest
|
||||||
```
|
```
|
||||||
|
|
||||||
### Example Usage
|
## Example Usage
|
||||||
```
|
```py
|
||||||
from homeharvest import scrape_property
|
>>> from homeharvest import scrape_property
|
||||||
|
>>> properties = scrape_property(
|
||||||
|
... location="85281", site_name="zillow", listing_type="for_rent"
|
||||||
|
... )
|
||||||
|
|
||||||
properties = scrape_property(
|
>>> properties.head()
|
||||||
location="85281", site_name="zillow", listing_type="for_rent"
|
address_one city ... mls_id description
|
||||||
)
|
0 420 N Scottsdale Rd Tempe ... NaN NaN
|
||||||
print(properties)
|
1 1255 E University Dr Tempe ... NaN NaN
|
||||||
|
2 1979 E Rio Salado Pkwy Tempe ... NaN NaN
|
||||||
|
3 548 S Wilson St Tempe ... None None
|
||||||
|
4 945 E Playa Del Norte Dr Unit 4027 Tempe ... NaN NaN
|
||||||
|
[5 rows x 23 columns]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Site Name Options
|
||||||
|
|
||||||
|
- `zillow`
|
||||||
|
- `redfin`
|
||||||
|
- `realtor.com`
|
||||||
|
|
||||||
|
### Listing Types
|
||||||
|
|
||||||
|
- `for_rent`
|
||||||
|
- `for_sale`
|
||||||
|
- `sold`
|
||||||
|
|
|
@ -92,7 +92,17 @@ def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
site_name: str,
|
site_name: str,
|
||||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||||
) -> list[Property]:
|
) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Scrape property from various sites from a given location and listing type.
|
||||||
|
|
||||||
|
:returns: pd.DataFrame
|
||||||
|
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
|
||||||
|
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
|
||||||
|
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
||||||
|
:return: pd.DataFrame containing properties
|
||||||
|
"""
|
||||||
|
|
||||||
validate_input(site_name, listing_type)
|
validate_input(site_name, listing_type)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
|
|
|
@ -14,6 +14,8 @@ class ScraperInput:
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, scraper_input: ScraperInput):
|
def __init__(self, scraper_input: ScraperInput):
|
||||||
self.location = scraper_input.location
|
self.location = scraper_input.location
|
||||||
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|
|
@ -57,6 +57,7 @@ class Address:
|
||||||
country: str | None = None
|
country: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property:
|
class Property:
|
||||||
property_url: str
|
property_url: str
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
import json
|
import json
|
||||||
from ..models import Property, Address
|
from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any
|
from typing import Any, Generator
|
||||||
|
from ....exceptions import NoResultsFound
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
class RealtorScraper(Scraper):
|
class RealtorScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
|
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
||||||
|
|
||||||
def handle_location(self):
|
def handle_location(self):
|
||||||
headers = {
|
headers = {
|
||||||
|
@ -26,7 +29,7 @@ class RealtorScraper(Scraper):
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"input": self.location,
|
"input": self.location,
|
||||||
"client_id": "for-sale",
|
"client_id": self.listing_type.value.replace('_', '-'),
|
||||||
"limit": "1",
|
"limit": "1",
|
||||||
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
||||||
}
|
}
|
||||||
|
@ -38,14 +41,228 @@ class RealtorScraper(Scraper):
|
||||||
)
|
)
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
return response_json["autocomplete"][0]
|
result = response_json["autocomplete"]
|
||||||
|
|
||||||
|
if result is None:
|
||||||
|
raise NoResultsFound("No results found for location: " + self.location)
|
||||||
|
|
||||||
|
return result[0]
|
||||||
|
|
||||||
|
def handle_address(self, property_id: str) -> list[Property]:
    """
    Look up a single property by id through the GraphQL search endpoint.

    :param property_id: value sent as the ``$property_id`` query variable; it is
        also stored on the result as ``mls_id``
    :return: a one-element list holding the Property built from the response
    """
    query = """query Property($property_id: ID!) {
        property(id: $property_id) {
            property_id
            details {
                date_updated
                garage
                permalink
                year_built
                stories
            }
            address {
                address_validation_code
                city
                country
                county
                line
                postal_code
                state_code
                street_direction
                street_name
                street_number
                street_suffix
                street_post_direction
                unit_value
                unit
                unit_descriptor
                zip
            }
            basic {
                baths
                beds
                price
                sqft
                lot_sqft
                type
                sold_price
            }
            public_record {
                lot_size
                sqft
                stories
                units
                year_built
            }
        }
    }"""

    payload = {
        'query': query,
        'variables': {'property_id': property_id},
    }

    response = self.session.post(self.search_url, json=payload)
    property_info = response.json()['data']['property']

    address = property_info['address']
    details = property_info['details']
    basic = property_info['basic']
    public_record = property_info['public_record']

    price = basic['price']
    sqft = basic['sqft']
    # A missing *or zero* square footage must not reach the division below.
    price_per_square_foot = price / sqft if sqft and price is not None else None

    return [Property(
        site_name=self.site_name,
        address=Address(
            address_one=address['line'],
            city=address['city'],
            state=address['state_code'],
            zip_code=address['postal_code'],
        ),
        # The Property dataclass declares this field as ``property_url``.
        property_url="https://www.realtor.com/realestateandhomes-detail/" + details['permalink'],
        beds=basic['beds'],
        baths=basic['baths'],
        stories=details['stories'],
        year_built=details['year_built'],
        square_feet=sqft,
        price_per_square_foot=price_per_square_foot,
        price=price,
        mls_id=property_id,
        listing_type=self.listing_type,
        # public_record can be null in the response.
        lot_size=public_record['lot_size'] if public_record is not None else None,
    )]
|
||||||
|
|
||||||
|
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
    """
    Run one page of the ``home_search`` GraphQL query.

    :param variables: GraphQL variables (``city``, ``county``, ``state_code``,
        ``postal_code``, ``offset``) sent with the query
    :param return_total: when True, return only the ``total`` reported by the
        search instead of the parsed results
    :return: the total as reported by the response if ``return_total`` is set,
        otherwise a list of Property objects for this page (the query asks for
        at most 200 results per request)
    """
    # The listing status is interpolated into the query text; everything else
    # travels as GraphQL variables.
    query = """query Home_search(
        $city: String,
        $county: [String],
        $state_code: String,
        $postal_code: String
        $offset: Int,
    ) {
        home_search(
            query: {
                city: $city
                county: $county
                postal_code: $postal_code
                state_code: $state_code
                status: %s
            }
            limit: 200
            offset: $offset
        ) {
            count
            total
            results {
                property_id
                description {
                    baths
                    beds
                    lot_sqft
                    sqft
                    text
                    sold_price
                    stories
                    year_built
                    garage
                    unit_number
                    floor_number
                }
                location {
                    address {
                        city
                        country
                        line
                        postal_code
                        state_code
                        state
                        street_direction
                        street_name
                        street_number
                        street_post_direction
                        street_suffix
                        unit
                    }
                }
                list_price
                price_per_sqft
                source {
                    id
                }
            }
        }
    }""" % self.listing_type.value

    payload = {
        'query': query,
        'variables': variables,
    }

    response = self.session.post(self.search_url, json=payload)
    home_search = response.json()['data']['home_search']

    if return_total:
        return home_search['total']

    return [
        Property(
            address=Address(
                address_one=result['location']['address']['line'],
                city=result['location']['address']['city'],
                state=result['location']['address']['state_code'],
                zip_code=result['location']['address']['postal_code'],
                address_two=result['location']['address']['unit'],
            ),
            site_name=self.site_name,
            # The Property dataclass declares this field as ``property_url``.
            property_url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
            beds=result['description']['beds'],
            baths=result['description']['baths'],
            stories=result['description']['stories'],
            year_built=result['description']['year_built'],
            square_feet=result['description']['sqft'],
            price_per_square_foot=result['price_per_sqft'],
            price=result['list_price'],
            mls_id=result['property_id'],
            listing_type=self.listing_type,
            lot_size=result['description']['lot_sqft'],
        )
        for result in home_search['results']
    ]
|
||||||
|
|
||||||
def search(self):
    """
    Resolve ``self.location`` and return the matching properties.

    The location is first resolved through ``handle_location``. An ``address``
    match is looked up directly by its ``mpr_id``; any other area type is
    searched page by page through ``handle_area``.

    :return: list of properties, in page (offset) order; empty when the search
        reports no results
    """
    location_info = self.handle_location()

    if location_info["area_type"] == 'address':
        return self.handle_address(location_info['mpr_id'])

    search_variables = {
        'city': location_info.get('city'),
        'county': location_info.get('county'),
        'state_code': location_info.get('state_code'),
        'postal_code': location_info.get('postal_code'),
        'offset': 0,
    }

    total = self.handle_area(search_variables, return_total=True)
    if not total:
        # Nothing to page through (also avoids range() on a null total).
        return []

    # Step of 200 matches the page size requested by handle_area's query.
    page_variables = [
        search_variables | {'offset': offset} for offset in range(0, total, 200)
    ]

    homes = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields results in submission order, so the combined list is
        # stable across runs regardless of which request finishes first.
        for page in executor.map(self.handle_area, page_variables):
            homes.extend(page)

    return homes
|
||||||
|
|
|
@ -93,6 +93,35 @@ class RedfinScraper(Scraper):
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _parse_building(self, building: dict) -> Property:
    """
    Build a Property of type ``BUILDING`` from one building entry of the response.

    :param building: building dict; this method reads its ``address``, ``url``
        and ``numUnitsForSale`` keys
    :return: Property describing the building
    """
    address = building['address']

    # Skip empty/None components so the joined text has no doubled spaces and
    # a None component does not make str.join raise TypeError.
    street_address = " ".join(
        part
        for part in (
            address['streetNumber'],
            address['directionalPrefix'],
            address['streetName'],
            address['streetType'],
        )
        if part
    )
    unit = " ".join(
        part for part in (address['unitType'], address['unitValue']) if part
    )

    # NOTE(review): the realtor scraper in this changeset builds Address with
    # address_one/address_two rather than street_address/unit; confirm
    # which field names the Address model actually declares.
    return Property(
        site_name=self.site_name,
        property_type=PropertyType("BUILDING"),
        address=Address(
            street_address=street_address,
            city=address['city'],
            state=address['stateOrProvinceCode'],
            zip_code=address['postalCode'],
            unit=unit,
        ),
        property_url="https://www.redfin.com{}".format(building["url"]),
        listing_type=self.listing_type,
        bldg_unit_count=building["numUnitsForSale"],
    )
|
||||||
|
|
||||||
|
|
||||||
def handle_address(self, home_id: str):
|
def handle_address(self, home_id: str):
|
||||||
"""
|
"""
|
||||||
EPs:
|
EPs:
|
||||||
|
@ -130,5 +159,8 @@ class RedfinScraper(Scraper):
|
||||||
|
|
||||||
homes = [
|
homes = [
|
||||||
self._parse_home(home) for home in response_json["payload"]["homes"]
|
self._parse_home(home) for home in response_json["payload"]["homes"]
|
||||||
] #: support buildings
|
] + [
|
||||||
|
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
|
||||||
|
]
|
||||||
|
|
||||||
return homes
|
return homes
|
||||||
|
|
|
@ -117,11 +117,10 @@ class ZillowScraper(Scraper):
|
||||||
"isDebugRequest": False,
|
"isDebugRequest": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
print(payload)
|
|
||||||
resp = self.session.put(url, headers=self._get_headers(), data=payload)
|
resp = self.session.put(url, headers=self._get_headers(), data=payload)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
a = resp.json()
|
a = resp.json()
|
||||||
return parse_properties(resp.json())
|
return self._parse_properties(resp.json())
|
||||||
|
|
||||||
def _parse_properties(self, property_data: dict):
|
def _parse_properties(self, property_data: dict):
|
||||||
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
||||||
|
@ -129,7 +128,6 @@ class ZillowScraper(Scraper):
|
||||||
properties_list = []
|
properties_list = []
|
||||||
|
|
||||||
for result in mapresults:
|
for result in mapresults:
|
||||||
try:
|
|
||||||
if "hdpData" in result:
|
if "hdpData" in result:
|
||||||
home_info = result["hdpData"]["homeInfo"]
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
address_data = {
|
address_data = {
|
||||||
|
@ -217,11 +215,6 @@ class ZillowScraper(Scraper):
|
||||||
building_obj = Property(**building_data)
|
building_obj = Property(**building_data)
|
||||||
properties_list.append(building_obj)
|
properties_list.append(building_obj)
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(home_info)
|
|
||||||
traceback.print_exc()
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
return properties_list
|
return properties_list
|
||||||
|
|
||||||
def _extract_units(self, result: dict):
|
def _extract_units(self, result: dict):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.1.2"
|
version = "0.1.3"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
|
|
|
@ -3,6 +3,9 @@ from homeharvest import scrape_property
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
results = [
|
results = [
|
||||||
|
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
|
||||||
|
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format
|
||||||
|
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="85281", site_name="realtor.com"),
|
scrape_property(location="85281", site_name="realtor.com"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue