mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3b7c17b7b5 | ||
|
|
59317fd6fc | ||
|
|
928b431d1f | ||
|
|
896f862137 | ||
|
|
3174f5076c | ||
|
|
2abbb913a8 | ||
|
|
73b6d5b33f | ||
|
|
da39c989d9 | ||
|
|
01c53f9399 | ||
|
|
9200c17df2 | ||
|
|
9e262bf214 | ||
|
|
82f78fb578 | ||
|
|
b0e40df00a | ||
|
|
2fc40e0dad | ||
|
|
254f3a68a1 | ||
|
|
05713c76b0 | ||
|
|
9120cc9bfe | ||
|
|
eee4b19515 | ||
|
|
c25961eded | ||
|
|
0884c3d163 | ||
|
|
8f37bfdeb8 | ||
|
|
48c2338276 |
@@ -1,13 +1,16 @@
|
|||||||
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
||||||
|
|
||||||
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
|
||||||
|
|
||||||
|
|
||||||
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
|
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
|
||||||
|
|
||||||
[Try the demo on Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
[Try the demo on Replit](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||||
|
|
||||||
|
\
|
||||||
|
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
||||||
|
|
||||||
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||||
|
|
||||||
|
Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** – a Python package for job scraping*
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously
|
||||||
|
|||||||
@@ -57,6 +57,10 @@ def _get_ordered_properties(result: Property) -> list[str]:
|
|||||||
"stories",
|
"stories",
|
||||||
"year_built",
|
"year_built",
|
||||||
"agent_name",
|
"agent_name",
|
||||||
|
"agent_phone",
|
||||||
|
"agent_email",
|
||||||
|
"days_on_market",
|
||||||
|
"sold_date",
|
||||||
"mls_id",
|
"mls_id",
|
||||||
"img_src",
|
"img_src",
|
||||||
"latitude",
|
"latitude",
|
||||||
@@ -84,6 +88,18 @@ def _process_result(result: Property) -> pd.DataFrame:
|
|||||||
|
|
||||||
del prop_data["address"]
|
del prop_data["address"]
|
||||||
|
|
||||||
|
if "agent" in prop_data and prop_data["agent"] is not None:
|
||||||
|
agent_data = prop_data["agent"]
|
||||||
|
prop_data["agent_name"] = agent_data.name
|
||||||
|
prop_data["agent_phone"] = agent_data.phone
|
||||||
|
prop_data["agent_email"] = agent_data.email
|
||||||
|
|
||||||
|
del prop_data["agent"]
|
||||||
|
else:
|
||||||
|
prop_data["agent_name"] = None
|
||||||
|
prop_data["agent_phone"] = None
|
||||||
|
prop_data["agent_email"] = None
|
||||||
|
|
||||||
properties_df = pd.DataFrame([prop_data])
|
properties_df = pd.DataFrame([prop_data])
|
||||||
properties_df = properties_df[_get_ordered_properties(result)]
|
properties_df = properties_df[_get_ordered_properties(result)]
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
|
import tls_client
|
||||||
from .models import Property, ListingType, SiteName
|
from .models import Property, ListingType, SiteName
|
||||||
|
|
||||||
|
|
||||||
@@ -12,15 +13,20 @@ class ScraperInput:
|
|||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, scraper_input: ScraperInput):
|
def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None):
|
||||||
self.location = scraper_input.location
|
self.location = scraper_input.location
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
self.session = requests.Session()
|
if not session:
|
||||||
|
self.session = requests.Session()
|
||||||
|
else:
|
||||||
|
self.session = session
|
||||||
|
|
||||||
if scraper_input.proxy:
|
if scraper_input.proxy:
|
||||||
proxy_url = scraper_input.proxy
|
proxy_url = scraper_input.proxy
|
||||||
proxies = {"http": proxy_url, "https": proxy_url}
|
proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
self.session.proxies.update(proxies)
|
self.session.proxies.update(proxies)
|
||||||
|
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class SiteName(Enum):
|
class SiteName(Enum):
|
||||||
@@ -64,6 +65,13 @@ class Address:
|
|||||||
zip_code: str | None = None
|
zip_code: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Agent:
|
||||||
|
name: str
|
||||||
|
phone: str | None = None
|
||||||
|
email: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property:
|
class Property:
|
||||||
property_url: str
|
property_url: str
|
||||||
@@ -81,11 +89,11 @@ class Property:
|
|||||||
price_per_sqft: int | None = None
|
price_per_sqft: int | None = None
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent: Agent | None = None
|
||||||
img_src: str | None = None
|
img_src: str | None = None
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
status_text: str | None = None
|
status_text: str | None = None
|
||||||
posted_time: str | None = None
|
posted_time: datetime | None = None
|
||||||
|
|
||||||
# building for sale
|
# building for sale
|
||||||
bldg_name: str | None = None
|
bldg_name: str | None = None
|
||||||
@@ -107,3 +115,6 @@ class Property:
|
|||||||
|
|
||||||
latitude: float | None = None
|
latitude: float | None = None
|
||||||
longitude: float | None = None
|
longitude: float | None = None
|
||||||
|
|
||||||
|
sold_date: datetime | None = None
|
||||||
|
days_on_market: int | None = None
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ import json
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_address_one
|
from ....utils import parse_address_two, parse_address_one
|
||||||
from ..models import Property, Address, PropertyType, ListingType, SiteName
|
from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound, SearchTooBroad
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class RedfinScraper(Scraper):
|
class RedfinScraper(Scraper):
|
||||||
@@ -30,6 +31,8 @@ class RedfinScraper(Scraper):
|
|||||||
return "6" #: city
|
return "6" #: city
|
||||||
elif match_type == "1":
|
elif match_type == "1":
|
||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
elif match_type == "11":
|
||||||
|
return "state"
|
||||||
|
|
||||||
if "exactMatch" not in response_json["payload"]:
|
if "exactMatch" not in response_json["payload"]:
|
||||||
raise NoResultsFound("No results found for location: {}".format(self.location))
|
raise NoResultsFound("No results found for location: {}".format(self.location))
|
||||||
@@ -74,6 +77,8 @@ class RedfinScraper(Scraper):
|
|||||||
else:
|
else:
|
||||||
lot_size = lot_size_data
|
lot_size = lot_size_data
|
||||||
|
|
||||||
|
lat_long = get_value("latLong")
|
||||||
|
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
@@ -88,15 +93,20 @@ class RedfinScraper(Scraper):
|
|||||||
sqft_min=get_value("sqFt"),
|
sqft_min=get_value("sqFt"),
|
||||||
sqft_max=get_value("sqFt"),
|
sqft_max=get_value("sqFt"),
|
||||||
stories=home["stories"] if "stories" in home else None,
|
stories=home["stories"] if "stories" in home else None,
|
||||||
agent_name=get_value("listingAgent"),
|
agent=Agent( #: listingAgent, some have sellingAgent as well
|
||||||
|
name=home['listingAgent'].get('name') if 'listingAgent' in home else None,
|
||||||
|
phone=home['listingAgent'].get('phone') if 'listingAgent' in home else None,
|
||||||
|
),
|
||||||
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
||||||
year_built=get_value("yearBuilt") if not single_search else home["yearBuilt"],
|
year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
|
||||||
lot_area_value=lot_size,
|
lot_area_value=lot_size,
|
||||||
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
||||||
price_per_sqft=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
latitude=lat_long.get('latitude') if lat_long else None,
|
||||||
longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
|
longitude=lat_long.get('longitude') if lat_long else None,
|
||||||
|
sold_date=datetime.fromtimestamp(home['soldDate'] / 1000) if 'soldDate' in home else None,
|
||||||
|
days_on_market=get_value("dom")
|
||||||
)
|
)
|
||||||
|
|
||||||
def _handle_rentals(self, region_id, region_type):
|
def _handle_rentals(self, region_id, region_type):
|
||||||
@@ -183,7 +193,7 @@ class RedfinScraper(Scraper):
|
|||||||
),
|
),
|
||||||
property_url="https://www.redfin.com{}".format(building["url"]),
|
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
unit_count=building["numUnitsForSale"],
|
unit_count=building.get("numUnitsForSale"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_address(self, home_id: str):
|
def handle_address(self, home_id: str):
|
||||||
@@ -207,6 +217,9 @@ class RedfinScraper(Scraper):
|
|||||||
def search(self):
|
def search(self):
|
||||||
region_id, region_type = self._handle_location()
|
region_id, region_type = self._handle_location()
|
||||||
|
|
||||||
|
if region_type == "state":
|
||||||
|
raise SearchTooBroad("State searches are not supported, please use a more specific location.")
|
||||||
|
|
||||||
if region_type == "address":
|
if region_type == "address":
|
||||||
home_id = region_id
|
home_id = region_id
|
||||||
return self.handle_address(home_id)
|
return self.handle_address(home_id)
|
||||||
@@ -220,7 +233,14 @@ class RedfinScraper(Scraper):
|
|||||||
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
|
||||||
response = self.session.get(url)
|
response = self.session.get(url)
|
||||||
response_json = json.loads(response.text.replace("{}&&", ""))
|
response_json = json.loads(response.text.replace("{}&&", ""))
|
||||||
homes = [self._parse_home(home) for home in response_json["payload"]["homes"]] + [
|
|
||||||
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
|
if "payload" in response_json:
|
||||||
]
|
homes_list = response_json["payload"].get("homes", [])
|
||||||
return homes
|
buildings_list = response_json["payload"].get("buildings", {}).values()
|
||||||
|
|
||||||
|
homes = [self._parse_home(home) for home in homes_list] + [
|
||||||
|
self._parse_building(building) for building in buildings_list
|
||||||
|
]
|
||||||
|
return homes
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|||||||
@@ -6,15 +6,41 @@ This module implements the scraper for zillow.com
|
|||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
import tls_client
|
||||||
|
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
from ....utils import parse_address_one, parse_address_two
|
from ....utils import parse_address_one, parse_address_two
|
||||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType
|
from ..models import Property, Address, ListingType, PropertyType, Agent
|
||||||
|
import urllib.parse
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
session = tls_client.Session(
|
||||||
|
client_identifier="chrome112", random_tls_extension_order=True
|
||||||
|
)
|
||||||
|
|
||||||
|
super().__init__(scraper_input, session)
|
||||||
|
|
||||||
|
self.session.headers.update({
|
||||||
|
'authority': 'www.zillow.com',
|
||||||
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||||||
|
'accept-language': 'en-US,en;q=0.9',
|
||||||
|
'cache-control': 'max-age=0',
|
||||||
|
'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
|
||||||
|
'sec-ch-ua-mobile': '?0',
|
||||||
|
'sec-ch-ua-platform': '"Windows"',
|
||||||
|
'sec-fetch-dest': 'document',
|
||||||
|
'sec-fetch-mode': 'navigate',
|
||||||
|
'sec-fetch-site': 'same-origin',
|
||||||
|
'sec-fetch-user': '?1',
|
||||||
|
'upgrade-insecure-requests': '1',
|
||||||
|
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
|
||||||
|
})
|
||||||
|
|
||||||
if not self.is_plausible_location(self.location):
|
if not self.is_plausible_location(self.location):
|
||||||
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||||
@@ -31,15 +57,18 @@ class ZillowScraper(Scraper):
|
|||||||
url = (
|
url = (
|
||||||
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
|
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
|
||||||
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
|
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
|
||||||
).format(location)
|
).format(urllib.parse.quote(location))
|
||||||
|
|
||||||
response = self.session.get(url)
|
resp = self.session.get(url)
|
||||||
|
|
||||||
return response.json()["results"] != []
|
return resp.json()["results"] != []
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(self.url, headers=self._get_headers())
|
resp = self.session.get(self.url)
|
||||||
resp.raise_for_status()
|
if resp.status_code != 200:
|
||||||
|
raise HTTPError(
|
||||||
|
f"bad response status code: {resp.status_code}"
|
||||||
|
)
|
||||||
content = resp.text
|
content = resp.text
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
@@ -133,11 +162,23 @@ class ZillowScraper(Scraper):
|
|||||||
"wants": {"cat1": ["mapResults"]},
|
"wants": {"cat1": ["mapResults"]},
|
||||||
"isDebugRequest": False,
|
"isDebugRequest": False,
|
||||||
}
|
}
|
||||||
resp = self.session.put(url, headers=self._get_headers(), json=payload)
|
resp = self.session.put(url, json=payload)
|
||||||
resp.raise_for_status()
|
if resp.status_code != 200:
|
||||||
a = resp.json()
|
raise HTTPError(
|
||||||
|
f"bad response status code: {resp.status_code}"
|
||||||
|
)
|
||||||
return self._parse_properties(resp.json())
|
return self._parse_properties(resp.json())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_posted_time(time: str) -> datetime:
|
||||||
|
int_time = int(time.split(" ")[0])
|
||||||
|
|
||||||
|
if "hour" in time:
|
||||||
|
return datetime.now() - timedelta(hours=int_time)
|
||||||
|
|
||||||
|
if "day" in time:
|
||||||
|
return datetime.now() - timedelta(days=int_time)
|
||||||
|
|
||||||
def _parse_properties(self, property_data: dict):
|
def _parse_properties(self, property_data: dict):
|
||||||
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
mapresults = property_data["cat1"]["searchResults"]["mapResults"]
|
||||||
|
|
||||||
@@ -147,26 +188,26 @@ class ZillowScraper(Scraper):
|
|||||||
if "hdpData" in result:
|
if "hdpData" in result:
|
||||||
home_info = result["hdpData"]["homeInfo"]
|
home_info = result["hdpData"]["homeInfo"]
|
||||||
address_data = {
|
address_data = {
|
||||||
"address_one": parse_address_one(home_info["streetAddress"])[0],
|
"address_one": parse_address_one(home_info.get("streetAddress"))[0],
|
||||||
"address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
|
"address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
|
||||||
"city": home_info["city"],
|
"city": home_info.get("city"),
|
||||||
"state": home_info["state"],
|
"state": home_info.get("state"),
|
||||||
"zip_code": home_info["zipcode"],
|
"zip_code": home_info.get("zipcode"),
|
||||||
}
|
}
|
||||||
property_obj = Property(
|
property_obj = Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=Address(**address_data),
|
address=Address(**address_data),
|
||||||
property_url=f"https://www.zillow.com{result['detailUrl']}",
|
property_url=f"https://www.zillow.com{result['detailUrl']}",
|
||||||
tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None,
|
tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None,
|
||||||
property_type=PropertyType(home_info["homeType"]),
|
property_type=PropertyType(home_info.get("homeType")),
|
||||||
listing_type=ListingType(
|
listing_type=ListingType(
|
||||||
home_info["statusType"] if "statusType" in home_info else self.listing_type
|
home_info["statusType"] if "statusType" in home_info else self.listing_type
|
||||||
),
|
),
|
||||||
status_text=result.get("statusText"),
|
status_text=result.get("statusText"),
|
||||||
posted_time=result["variableData"]["text"]
|
posted_time=self.parse_posted_time(result["variableData"]["text"])
|
||||||
if "variableData" in result
|
if "variableData" in result
|
||||||
and "text" in result["variableData"]
|
and "text" in result["variableData"]
|
||||||
and result["variableData"]["type"] == "TIME_ON_INFO"
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
else None,
|
else None,
|
||||||
price_min=home_info.get("price"),
|
price_min=home_info.get("price"),
|
||||||
price_max=home_info.get("price"),
|
price_max=home_info.get("price"),
|
||||||
@@ -198,18 +239,17 @@ class ZillowScraper(Scraper):
|
|||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_type=PropertyType("BUILDING"),
|
property_type=PropertyType("BUILDING"),
|
||||||
listing_type=ListingType(result["statusType"]),
|
listing_type=ListingType(result["statusType"]),
|
||||||
img_src=result["imgSrc"],
|
img_src=result.get("imgSrc"),
|
||||||
address=self._extract_address(result["address"]),
|
address=self._extract_address(result["address"]),
|
||||||
baths_min=result["minBaths"],
|
baths_min=result.get("minBaths"),
|
||||||
area_min=result.get("minArea"),
|
area_min=result.get("minArea"),
|
||||||
bldg_name=result.get("communityName"),
|
bldg_name=result.get("communityName"),
|
||||||
status_text=result["statusText"],
|
status_text=result.get("statusText"),
|
||||||
beds_min=result["minBeds"],
|
price_min=price_value if "+/mo" in result.get("price") else None,
|
||||||
price_min=price_value if "+/mo" in result["price"] else None,
|
price_max=price_value if "+/mo" in result.get("price") else None,
|
||||||
price_max=price_value if "+/mo" in result["price"] else None,
|
latitude=result.get("latLong", {}).get("latitude"),
|
||||||
latitude=result["latLong"]["latitude"],
|
longitude=result.get("latLong", {}).get("longitude"),
|
||||||
longitude=result["latLong"]["longitude"],
|
unit_count=result.get("unitCount"),
|
||||||
unit_count=result["unitCount"],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
properties_list.append(building_obj)
|
properties_list.append(building_obj)
|
||||||
@@ -238,14 +278,16 @@ class ZillowScraper(Scraper):
|
|||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_url=url,
|
property_url=url,
|
||||||
property_type=PropertyType(property_type),
|
property_type=PropertyType(property_type) if property_type in PropertyType.__members__ else None,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
address=address,
|
address=address,
|
||||||
year_built=property_data.get("yearBuilt"),
|
year_built=property_data.get("yearBuilt"),
|
||||||
tax_assessed_value=property_data.get("taxAssessedValue"),
|
tax_assessed_value=property_data.get("taxAssessedValue"),
|
||||||
lot_area_value=property_data.get("lotAreaValue"),
|
lot_area_value=property_data.get("lotAreaValue"),
|
||||||
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
|
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
|
||||||
agent_name=property_data.get("attributionInfo", {}).get("agentName"),
|
agent=Agent(
|
||||||
|
name=property_data.get("attributionInfo", {}).get("agentName")
|
||||||
|
),
|
||||||
stories=property_data.get("resoFacts", {}).get("stories"),
|
stories=property_data.get("resoFacts", {}).get("stories"),
|
||||||
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
|
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
|
||||||
beds_min=property_data.get("bedrooms"),
|
beds_min=property_data.get("bedrooms"),
|
||||||
@@ -294,22 +336,3 @@ class ZillowScraper(Scraper):
|
|||||||
state=state,
|
state=state,
|
||||||
zip_code=zip_code,
|
zip_code=zip_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _get_headers():
|
|
||||||
return {
|
|
||||||
"authority": "www.zillow.com",
|
|
||||||
"accept": "*/*",
|
|
||||||
"accept-language": "en-US,en;q=0.9",
|
|
||||||
"content-type": "application/json",
|
|
||||||
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
|
|
||||||
"origin": "https://www.zillow.com",
|
|
||||||
"referer": "https://www.zillow.com",
|
|
||||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
|
||||||
"sec-ch-ua-mobile": "?0",
|
|
||||||
"sec-ch-ua-platform": '"Windows"',
|
|
||||||
"sec-fetch-dest": "empty",
|
|
||||||
"sec-fetch-mode": "cors",
|
|
||||||
"sec-fetch-site": "same-origin",
|
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -12,3 +12,7 @@ class NoResultsFound(Exception):
|
|||||||
|
|
||||||
class GeoCoordsNotFound(Exception):
|
class GeoCoordsNotFound(Exception):
|
||||||
"""Raised when no property is found for the given address"""
|
"""Raised when no property is found for the given address"""
|
||||||
|
|
||||||
|
|
||||||
|
class SearchTooBroad(Exception):
|
||||||
|
"""Raised when the search is too broad"""
|
||||||
|
|||||||
13
poetry.lock
generated
13
poetry.lock
generated
@@ -408,6 +408,17 @@ files = [
|
|||||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tls-client"
|
||||||
|
version = "0.2.2"
|
||||||
|
description = "Advanced Python HTTP Client."
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
|
||||||
|
{file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tomli"
|
name = "tomli"
|
||||||
version = "2.0.1"
|
version = "2.0.1"
|
||||||
@@ -450,4 +461,4 @@ zstd = ["zstandard (>=0.18.0)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"
|
content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.2.9"
|
version = "0.2.18"
|
||||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
@@ -14,6 +14,7 @@ python = "^3.10"
|
|||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
pandas = "^2.1.0"
|
pandas = "^2.1.0"
|
||||||
openpyxl = "^3.1.2"
|
openpyxl = "^3.1.2"
|
||||||
|
tls-client = "^0.2.2"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|||||||
@@ -4,11 +4,13 @@ from homeharvest.exceptions import (
|
|||||||
InvalidListingType,
|
InvalidListingType,
|
||||||
NoResultsFound,
|
NoResultsFound,
|
||||||
GeoCoordsNotFound,
|
GeoCoordsNotFound,
|
||||||
|
SearchTooBroad,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
results = [
|
results = [
|
||||||
|
scrape_property(location="San Diego", site_name="redfin", listing_type="for_sale"),
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
|
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
|
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
|
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
|
||||||
@@ -24,9 +26,10 @@ def test_redfin():
|
|||||||
location="abceefg ju098ot498hh9",
|
location="abceefg ju098ot498hh9",
|
||||||
site_name="redfin",
|
site_name="redfin",
|
||||||
listing_type="for_sale",
|
listing_type="for_sale",
|
||||||
)
|
),
|
||||||
|
scrape_property(location="Florida", site_name="redfin", listing_type="for_rent"),
|
||||||
]
|
]
|
||||||
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad):
|
||||||
assert True
|
assert True
|
||||||
|
|
||||||
assert all([result is None for result in bad_results])
|
assert all([result is None for result in bad_results])
|
||||||
|
|||||||
@@ -11,8 +11,10 @@ def test_zillow():
|
|||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
|
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
|
scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
|
||||||
|
scrape_property(location="Surprise, AZ", site_name=["zillow"], listing_type="for_sale"),
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
|
scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
|
||||||
scrape_property(location="85281", site_name="zillow"),
|
scrape_property(location="85281", site_name="zillow"),
|
||||||
|
scrape_property(location="3268 88th st s, Lakewood", site_name="zillow", listing_type="for_rent"),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert all([result is not None for result in results])
|
assert all([result is not None for result in results])
|
||||||
|
|||||||
Reference in New Issue
Block a user