Compare commits

...

10 Commits

Author SHA1 Message Date
Cullen Watson
b0e40df00a Update pyproject.toml 2023-09-22 09:51:24 -05:00
Cullen Watson
2fc40e0dad fix: cookie 2023-09-22 09:47:37 -05:00
Zachary Hampton
254f3a68a1 - redfin bug fix 2023-09-21 18:54:03 -07:00
Zachary Hampton
05713c76b0 - redfin bug fix
- .get
2023-09-21 11:27:12 -07:00
Cullen Watson
9120cc9bfe fix: remove line 2023-09-21 13:10:14 -05:00
Cullen Watson
eee4b19515 Merge branch 'master' of https://github.com/ZacharyHampton/HomeHarvest 2023-09-21 13:06:15 -05:00
Cullen Watson
c25961eded fix: KeyEror : [minBaths] 2023-09-21 13:06:06 -05:00
Zachary Hampton
0884c3d163 Update README.md 2023-09-21 09:55:29 -07:00
Cullen Watson
8f37bfdeb8 chore: version number 2023-09-21 11:19:23 -05:00
Cullen Watson
48c2338276 fix: keyerror 2023-09-21 11:18:37 -05:00
4 changed files with 41 additions and 29 deletions

View File

@@ -1,13 +1,16 @@
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400"> <img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library. **HomeHarvest** is a simple, yet comprehensive, real estate scraping library.
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo) [![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
\
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** – a Python package for job scraping*
## Features ## Features
- Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously - Scrapes properties from **Zillow**, **Realtor.com** & **Redfin** simultaneously

View File

@@ -90,10 +90,10 @@ class RedfinScraper(Scraper):
stories=home["stories"] if "stories" in home else None, stories=home["stories"] if "stories" in home else None,
agent_name=get_value("listingAgent"), agent_name=get_value("listingAgent"),
description=home["listingRemarks"] if "listingRemarks" in home else None, description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt") if not single_search else home["yearBuilt"], year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
lot_area_value=lot_size, lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")), property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_sqft=get_value("pricePerSqFt"), price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None, latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None, longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
@@ -183,7 +183,7 @@ class RedfinScraper(Scraper):
), ),
property_url="https://www.redfin.com{}".format(building["url"]), property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type, listing_type=self.listing_type,
unit_count=building["numUnitsForSale"], unit_count=building.get("numUnitsForSale"),
) )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
@@ -220,7 +220,14 @@ class RedfinScraper(Scraper):
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000" url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
response = self.session.get(url) response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", "")) response_json = json.loads(response.text.replace("{}&&", ""))
homes = [self._parse_home(home) for home in response_json["payload"]["homes"]] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values() if "payload" in response_json:
homes_list = response_json["payload"].get("homes", [])
buildings_list = response_json["payload"].get("buildings", {}).values()
homes = [self._parse_home(home) for home in homes_list] + [
self._parse_building(building) for building in buildings_list
] ]
return homes return homes
else:
return []

View File

@@ -15,6 +15,7 @@ from ..models import Property, Address, ListingType, PropertyType
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
self.cookies = None
if not self.is_plausible_location(self.location): if not self.is_plausible_location(self.location):
raise NoResultsFound("Invalid location input: {}".format(self.location)) raise NoResultsFound("Invalid location input: {}".format(self.location))
@@ -135,6 +136,7 @@ class ZillowScraper(Scraper):
} }
resp = self.session.put(url, headers=self._get_headers(), json=payload) resp = self.session.put(url, headers=self._get_headers(), json=payload)
resp.raise_for_status() resp.raise_for_status()
self.cookies = resp.cookies
a = resp.json() a = resp.json()
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
@@ -147,18 +149,18 @@ class ZillowScraper(Scraper):
if "hdpData" in result: if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"] home_info = result["hdpData"]["homeInfo"]
address_data = { address_data = {
"address_one": parse_address_one(home_info["streetAddress"])[0], "address_one": parse_address_one(home_info.get("streetAddress"))[0],
"address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#", "address_two": parse_address_two(home_info["unit"]) if "unit" in home_info else "#",
"city": home_info["city"], "city": home_info.get("city"),
"state": home_info["state"], "state": home_info.get("state"),
"zip_code": home_info["zipcode"], "zip_code": home_info.get("zipcode"),
} }
property_obj = Property( property_obj = Property(
site_name=self.site_name, site_name=self.site_name,
address=Address(**address_data), address=Address(**address_data),
property_url=f"https://www.zillow.com{result['detailUrl']}", property_url=f"https://www.zillow.com{result['detailUrl']}",
tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None, tax_assessed_value=int(home_info["taxAssessedValue"]) if "taxAssessedValue" in home_info else None,
property_type=PropertyType(home_info["homeType"]), property_type=PropertyType(home_info.get("homeType")),
listing_type=ListingType( listing_type=ListingType(
home_info["statusType"] if "statusType" in home_info else self.listing_type home_info["statusType"] if "statusType" in home_info else self.listing_type
), ),
@@ -198,18 +200,17 @@ class ZillowScraper(Scraper):
site_name=self.site_name, site_name=self.site_name,
property_type=PropertyType("BUILDING"), property_type=PropertyType("BUILDING"),
listing_type=ListingType(result["statusType"]), listing_type=ListingType(result["statusType"]),
img_src=result["imgSrc"], img_src=result.get("imgSrc"),
address=self._extract_address(result["address"]), address=self._extract_address(result["address"]),
baths_min=result["minBaths"], baths_min=result.get("minBaths"),
area_min=result.get("minArea"), area_min=result.get("minArea"),
bldg_name=result.get("communityName"), bldg_name=result.get("communityName"),
status_text=result["statusText"], status_text=result.get("statusText"),
beds_min=result["minBeds"], price_min=price_value if "+/mo" in result.get("price") else None,
price_min=price_value if "+/mo" in result["price"] else None, price_max=price_value if "+/mo" in result.get("price") else None,
price_max=price_value if "+/mo" in result["price"] else None, latitude=result.get("latLong", {}).get("latitude"),
latitude=result["latLong"]["latitude"], longitude=result.get("latLong", {}).get("longitude"),
longitude=result["latLong"]["longitude"], unit_count=result.get("unitCount"),
unit_count=result["unitCount"],
) )
properties_list.append(building_obj) properties_list.append(building_obj)
@@ -295,14 +296,12 @@ class ZillowScraper(Scraper):
zip_code=zip_code, zip_code=zip_code,
) )
@staticmethod def _get_headers(self):
def _get_headers(): headers = {
return {
"authority": "www.zillow.com", "authority": "www.zillow.com",
"accept": "*/*", "accept": "*/*",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"content-type": "application/json", "content-type": "application/json",
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
"origin": "https://www.zillow.com", "origin": "https://www.zillow.com",
"referer": "https://www.zillow.com", "referer": "https://www.zillow.com",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
@@ -313,3 +312,6 @@ class ZillowScraper(Scraper):
"sec-fetch-site": "same-origin", "sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
} }
if self.cookies:
headers['Cookie'] = self.cookies
return headers

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.2.9" version = "0.2.14"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"