mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6c6fef80ed | ||
|
|
62e3321277 | ||
|
|
80186ee8c5 | ||
|
|
3ec47c5b6a | ||
|
|
42e8ac4de9 |
@@ -67,8 +67,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# check rentals\n",
|
"# check rentals\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"chicago\",\n",
|
" location=\"chicago, illinois\",\n",
|
||||||
" site_name=[\"redfin\", \"realtor.com\"],\n",
|
" site_name=[\"redfin\", \"zillow\"],\n",
|
||||||
" listing_type=\"for_rent\"\n",
|
" listing_type=\"for_rent\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@@ -87,7 +87,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# check sold properties\n",
|
"# check sold properties\n",
|
||||||
"scrape_property(\n",
|
"scrape_property(\n",
|
||||||
" location=\"chicago, illinois\",\n",
|
" location=\"90210\",\n",
|
||||||
" site_name=[\"redfin\"],\n",
|
" site_name=[\"redfin\"],\n",
|
||||||
" listing_type=\"sold\"\n",
|
" listing_type=\"sold\"\n",
|
||||||
")"
|
")"
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ _scrapers = {
|
|||||||
"zillow": ZillowScraper,
|
"zillow": ZillowScraper,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def validate_input(site_name: str, listing_type: str) -> None:
|
def validate_input(site_name: str, listing_type: str) -> None:
|
||||||
if site_name.lower() not in _scrapers:
|
if site_name.lower() not in _scrapers:
|
||||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||||
@@ -26,6 +27,7 @@ def validate_input(site_name: str, listing_type: str) -> None:
|
|||||||
f"Provided listing type, '{listing_type}', does not exist."
|
f"Provided listing type, '{listing_type}', does not exist."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_ordered_properties(result: Property) -> list[str]:
|
def get_ordered_properties(result: Property) -> list[str]:
|
||||||
return [
|
return [
|
||||||
"property_url",
|
"property_url",
|
||||||
@@ -65,6 +67,7 @@ def get_ordered_properties(result: Property) -> list[str]:
|
|||||||
"longitude",
|
"longitude",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def process_result(result: Property) -> pd.DataFrame:
|
def process_result(result: Property) -> pd.DataFrame:
|
||||||
prop_data = result.__dict__
|
prop_data = result.__dict__
|
||||||
|
|
||||||
@@ -90,6 +93,7 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
|
|
||||||
return properties_df
|
return properties_df
|
||||||
|
|
||||||
|
|
||||||
def _scrape_single_site(
|
def _scrape_single_site(
|
||||||
location: str, site_name: str, listing_type: str
|
location: str, site_name: str, listing_type: str
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
@@ -108,7 +112,9 @@ def _scrape_single_site(
|
|||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
properties_dfs = [process_result(result) for result in results]
|
||||||
properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty]
|
properties_dfs = [
|
||||||
|
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
|
||||||
|
]
|
||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
@@ -143,7 +149,9 @@ def scrape_property(
|
|||||||
else:
|
else:
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
futures = {
|
futures = {
|
||||||
executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
|
executor.submit(
|
||||||
|
_scrape_single_site, location, s_name, listing_type
|
||||||
|
): s_name
|
||||||
for s_name in site_name
|
for s_name in site_name
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -157,5 +165,15 @@ def scrape_property(
|
|||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
final_df = pd.concat(results, ignore_index=True)
|
final_df = pd.concat(results, ignore_index=True)
|
||||||
final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first")
|
|
||||||
return final_df
|
columns_to_track = ["street_address", "city", "unit"]
|
||||||
|
|
||||||
|
#: validate they exist, otherwise create them
|
||||||
|
for col in columns_to_track:
|
||||||
|
if col not in final_df.columns:
|
||||||
|
final_df[col] = None
|
||||||
|
|
||||||
|
final_df = final_df.drop_duplicates(
|
||||||
|
subset=["street_address", "city", "unit"], keep="first"
|
||||||
|
)
|
||||||
|
return final_df
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
result = response_json["autocomplete"]
|
result = response_json["autocomplete"]
|
||||||
|
|
||||||
if result is None:
|
if not result:
|
||||||
raise NoResultsFound("No results found for location: " + self.location)
|
raise NoResultsFound("No results found for location: " + self.location)
|
||||||
|
|
||||||
return result[0]
|
return result[0]
|
||||||
@@ -249,8 +249,20 @@ class RealtorScraper(Scraper):
|
|||||||
unit=parse_unit(result["location"]["address"]["unit"]),
|
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
country="USA",
|
||||||
),
|
),
|
||||||
latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None,
|
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||||
longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None,
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lat" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
|
longitude=result["location"]["address"]["coordinate"]["lon"]
|
||||||
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lon" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
+ result["property_id"],
|
+ result["property_id"],
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from typing import Any
|
|||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_two, parse_unit
|
||||||
from ..models import Property, Address, PropertyType
|
from ..models import Property, Address, PropertyType
|
||||||
|
from ....exceptions import NoResultsFound
|
||||||
|
|
||||||
|
|
||||||
class RedfinScraper(Scraper):
|
class RedfinScraper(Scraper):
|
||||||
@@ -26,6 +27,11 @@ class RedfinScraper(Scraper):
|
|||||||
elif match_type == "1":
|
elif match_type == "1":
|
||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
|
||||||
|
if "exactMatch" not in response_json["payload"]:
|
||||||
|
raise NoResultsFound(
|
||||||
|
"No results found for location: {}".format(self.location)
|
||||||
|
)
|
||||||
|
|
||||||
if response_json["payload"]["exactMatch"] is not None:
|
if response_json["payload"]["exactMatch"] is not None:
|
||||||
target = response_json["payload"]["exactMatch"]
|
target = response_json["payload"]["exactMatch"]
|
||||||
else:
|
else:
|
||||||
@@ -94,8 +100,12 @@ class RedfinScraper(Scraper):
|
|||||||
price_per_sqft=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt"),
|
||||||
price=get_value("price"),
|
price=get_value("price"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
latitude=home["latLong"]["latitude"]
|
||||||
longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None
|
if "latLong" in home and "latitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
|
longitude=home["latLong"]["longitude"]
|
||||||
|
if "latLong" in home and "longitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_building(self, building: dict) -> Property:
|
def _parse_building(self, building: dict) -> Property:
|
||||||
|
|||||||
@@ -1,15 +1,18 @@
|
|||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import string
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_two, parse_unit
|
||||||
from ....exceptions import GeoCoordsNotFound
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
from ..models import Property, Address, ListingType, PropertyType
|
||||||
|
|
||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
if not self.is_plausible_location(self.location):
|
||||||
|
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||||
if self.listing_type == ListingType.FOR_SALE:
|
if self.listing_type == ListingType.FOR_SALE:
|
||||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
||||||
elif self.listing_type == ListingType.FOR_RENT:
|
elif self.listing_type == ListingType.FOR_RENT:
|
||||||
@@ -17,6 +20,18 @@ class ZillowScraper(Scraper):
|
|||||||
else:
|
else:
|
||||||
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
||||||
|
|
||||||
|
@staticmethod
def is_plausible_location(location: str) -> bool:
    """Return whether ``location`` passes a basic sanity check.

    The input is split on whitespace and each resulting block is inspected
    on its own. A block is rejected when it is longer than six characters
    and contains both at least one digit and at least one letter.

    :param location: free-form location text to check
    :return: ``False`` as soon as one block is rejected, ``True`` otherwise
        (an empty or whitespace-only string has no blocks and yields ``True``)
    """
    for block in location.split():
        # Short blocks are never rejected, so test the length first.
        if len(block) <= 6:
            continue

        has_digit = any(char.isdigit() for char in block)
        has_letter = any(char.isalpha() for char in block)
        if has_digit and has_letter:
            return False

    return True
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(self.url, headers=self._get_headers())
|
resp = self.session.get(self.url, headers=self._get_headers())
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
@@ -151,9 +166,6 @@ class ZillowScraper(Scraper):
|
|||||||
else None,
|
else None,
|
||||||
"currency": home_info["currency"],
|
"currency": home_info["currency"],
|
||||||
"price": home_info.get("price"),
|
"price": home_info.get("price"),
|
||||||
"square_feet": int(home_info["livingArea"])
|
|
||||||
if "livingArea" in home_info
|
|
||||||
else None,
|
|
||||||
"tax_assessed_value": int(home_info["taxAssessedValue"])
|
"tax_assessed_value": int(home_info["taxAssessedValue"])
|
||||||
if "taxAssessedValue" in home_info
|
if "taxAssessedValue" in home_info
|
||||||
else None,
|
else None,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.2.0"
|
version = "0.2.1"
|
||||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
|
|||||||
@@ -1,4 +1,10 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
@@ -18,3 +24,17 @@ def test_realtor():
|
|||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="realtor.com",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
@@ -1,4 +1,10 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
@@ -16,3 +22,17 @@ def test_redfin():
|
|||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="redfin",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
@@ -1,4 +1,10 @@
|
|||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_zillow():
|
def test_zillow():
|
||||||
@@ -16,3 +22,17 @@ def test_zillow():
|
|||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|
||||||
|
bad_results = []
|
||||||
|
try:
|
||||||
|
bad_results += [
|
||||||
|
scrape_property(
|
||||||
|
location="abceefg ju098ot498hh9",
|
||||||
|
site_name="zillow",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
||||||
|
assert True
|
||||||
|
|
||||||
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
Reference in New Issue
Block a user