Merge pull request #23 from ZacharyHampton/fix/recent-issues

Fixes & Changes for recent issues
Zachary Hampton 2023-09-28 18:07:55 -07:00 committed by GitHub
commit 3174f5076c
8 changed files with 123 additions and 35 deletions


@@ -57,6 +57,10 @@ def _get_ordered_properties(result: Property) -> list[str]:
         "stories",
         "year_built",
         "agent_name",
+        "agent_phone",
+        "agent_email",
+        "days_on_market",
+        "sold_date",
         "mls_id",
         "img_src",
         "latitude",
@@ -84,6 +88,18 @@ def _process_result(result: Property) -> pd.DataFrame:
        del prop_data["address"]

+    if "agent" in prop_data and prop_data["agent"] is not None:
+        agent_data = prop_data["agent"]
+        prop_data["agent_name"] = agent_data.name
+        prop_data["agent_phone"] = agent_data.phone
+        prop_data["agent_email"] = agent_data.email
+
+        del prop_data["agent"]
+    else:
+        prop_data["agent_name"] = None
+        prop_data["agent_phone"] = None
+        prop_data["agent_email"] = None
+
    properties_df = pd.DataFrame([prop_data])
    properties_df = properties_df[_get_ordered_properties(result)]
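Not part of the commit: a minimal sketch of what the new flattening in `_process_result` does to the exported rows, using a simplified stand-in for the Agent model added in the models change below (the sample values are made up).

```python
from dataclasses import dataclass

import pandas as pd


@dataclass
class Agent:  # simplified stand-in for the library's Agent model
    name: str
    phone: str | None = None
    email: str | None = None


# Hypothetical, partially-flattened property dict as _process_result sees it.
prop_data = {"property_url": "https://example.com/listing/1", "agent": Agent("Jane Doe", "555-0100")}

if "agent" in prop_data and prop_data["agent"] is not None:
    agent_data = prop_data.pop("agent")
    prop_data["agent_name"] = agent_data.name
    prop_data["agent_phone"] = agent_data.phone
    prop_data["agent_email"] = agent_data.email
else:
    prop_data["agent_name"] = prop_data["agent_phone"] = prop_data["agent_email"] = None

# The nested Agent becomes three scalar columns, so the DataFrame/CSV output stays flat.
print(pd.DataFrame([prop_data])[["agent_name", "agent_phone", "agent_email"]])
```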


@ -1,6 +1,7 @@
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Tuple from typing import Tuple
from datetime import datetime
class SiteName(Enum): class SiteName(Enum):
@ -64,6 +65,13 @@ class Address:
zip_code: str | None = None zip_code: str | None = None
@dataclass
class Agent:
name: str
phone: str | None = None
email: str | None = None
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str
@ -81,11 +89,11 @@ class Property:
price_per_sqft: int | None = None price_per_sqft: int | None = None
mls_id: str | None = None mls_id: str | None = None
agent_name: str | None = None agent: Agent | None = None
img_src: str | None = None img_src: str | None = None
description: str | None = None description: str | None = None
status_text: str | None = None status_text: str | None = None
posted_time: str | None = None posted_time: datetime | None = None
# building for sale # building for sale
bldg_name: str | None = None bldg_name: str | None = None
@ -107,3 +115,6 @@ class Property:
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
sold_date: datetime | None = None
days_on_market: int | None = None
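Not part of the commit: the net effect on consumers of the `Property` model, sketched as a hypothetical helper (`agent_display` is illustrative, not from the library).

```python
# Before: prop.agent_name was a flat string and posted_time was a str.
# After:  prop.agent is an Agent (or None), posted_time/sold_date are datetimes,
#         and days_on_market is a new optional int field.

def agent_display(prop) -> str:
    """Hypothetical helper: format the agent line defensively, since agent may be None."""
    if prop.agent is None:
        return "agent unknown"
    parts = [prop.agent.name]
    if prop.agent.phone:
        parts.append(prop.agent.phone)
    if prop.agent.email:
        parts.append(prop.agent.email)
    return " / ".join(parts)
```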


@@ -8,8 +8,9 @@ import json
 from typing import Any
 from .. import Scraper
 from ....utils import parse_address_two, parse_address_one
-from ..models import Property, Address, PropertyType, ListingType, SiteName
-from ....exceptions import NoResultsFound
+from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent
+from ....exceptions import NoResultsFound, SearchTooBroad
+from datetime import datetime


 class RedfinScraper(Scraper):
@@ -30,6 +31,8 @@ class RedfinScraper(Scraper):
                 return "6"  #: city
             elif match_type == "1":
                 return "address"  #: address, needs to be handled differently
+            elif match_type == "11":
+                return "state"

         if "exactMatch" not in response_json["payload"]:
             raise NoResultsFound("No results found for location: {}".format(self.location))
@@ -74,6 +77,8 @@ class RedfinScraper(Scraper):
         else:
             lot_size = lot_size_data

+        lat_long = get_value("latLong")
+
         return Property(
             site_name=self.site_name,
             listing_type=self.listing_type,
@@ -88,15 +93,20 @@ class RedfinScraper(Scraper):
             sqft_min=get_value("sqFt"),
             sqft_max=get_value("sqFt"),
             stories=home["stories"] if "stories" in home else None,
-            agent_name=get_value("listingAgent"),
+            agent=Agent(  #: listingAgent, some have sellingAgent as well
+                name=home['listingAgent'].get('name') if 'listingAgent' in home else None,
+                phone=home['listingAgent'].get('phone') if 'listingAgent' in home else None,
+            ),
             description=home["listingRemarks"] if "listingRemarks" in home else None,
             year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
             lot_area_value=lot_size,
             property_type=PropertyType.from_int_code(home.get("propertyType")),
             price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
             mls_id=get_value("mlsId"),
-            latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
-            longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
+            latitude=lat_long.get('latitude') if lat_long else None,
+            longitude=lat_long.get('longitude') if lat_long else None,
+            sold_date=datetime.fromtimestamp(home['soldDate'] / 1000) if 'soldDate' in home else None,
+            days_on_market=get_value("dom")
         )

     def _handle_rentals(self, region_id, region_type):
@@ -207,6 +217,9 @@ class RedfinScraper(Scraper):
     def search(self):
         region_id, region_type = self._handle_location()

+        if region_type == "state":
+            raise SearchTooBroad("State searches are not supported, please use a more specific location.")
+
         if region_type == "address":
             home_id = region_id
             return self.handle_address(home_id)
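Not part of the commit: Redfin reports `soldDate` in epoch milliseconds, so the new `sold_date` mapping divides by 1000 before calling `datetime.fromtimestamp`, which expects seconds. A quick sketch with a made-up payload value:

```python
from datetime import datetime

home = {"soldDate": 1_695_859_200_000}  # hypothetical Redfin payload value, in milliseconds

# Same conversion as the Property(sold_date=...) argument above.
sold_date = datetime.fromtimestamp(home["soldDate"] / 1000) if "soldDate" in home else None
print(sold_date)  # 2023-09-28 00:00:00 UTC, rendered in the local timezone
```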


@@ -6,16 +6,25 @@ This module implements the scraper for zillow.com
 """
 import re
 import json
+import tls_client
 from .. import Scraper
+from requests.exceptions import HTTPError
 from ....utils import parse_address_one, parse_address_two
 from ....exceptions import GeoCoordsNotFound, NoResultsFound
-from ..models import Property, Address, ListingType, PropertyType
+from ..models import Property, Address, ListingType, PropertyType, Agent
+import urllib.parse
+from datetime import datetime, timedelta


 class ZillowScraper(Scraper):
     def __init__(self, scraper_input):
         super().__init__(scraper_input)
         self.cookies = None

+        self.session = tls_client.Session(
+            client_identifier="chrome112", random_tls_extension_order=True
+        )
+
         if not self.is_plausible_location(self.location):
             raise NoResultsFound("Invalid location input: {}".format(self.location))
@@ -32,15 +41,18 @@ class ZillowScraper(Scraper):
         url = (
             "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
             "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
-        ).format(location)
+        ).format(urllib.parse.quote(location))

-        response = self.session.get(url)
-        return response.json()["results"] != []
+        resp = self.session.get(url)
+        return resp.json()["results"] != []

     def search(self):
         resp = self.session.get(self.url, headers=self._get_headers())
-        resp.raise_for_status()
+        if resp.status_code != 200:
+            raise HTTPError(
+                f"bad response status code: {resp.status_code}"
+            )

         content = resp.text

         match = re.search(
@@ -135,11 +147,23 @@ class ZillowScraper(Scraper):
             "isDebugRequest": False,
         }
         resp = self.session.put(url, headers=self._get_headers(), json=payload)
-        resp.raise_for_status()
+        if resp.status_code != 200:
+            raise HTTPError(
+                f"bad response status code: {resp.status_code}"
+            )
         self.cookies = resp.cookies
+        a = resp.json()
         return self._parse_properties(resp.json())

+    @staticmethod
+    def parse_posted_time(time: str) -> datetime:
+        int_time = int(time.split(" ")[0])
+
+        if "hour" in time:
+            return datetime.now() - timedelta(hours=int_time)
+
+        if "day" in time:
+            return datetime.now() - timedelta(days=int_time)
+
     def _parse_properties(self, property_data: dict):
         mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@@ -165,7 +189,7 @@ class ZillowScraper(Scraper):
                 home_info["statusType"] if "statusType" in home_info else self.listing_type
             ),
             status_text=result.get("statusText"),
-            posted_time=result["variableData"]["text"]
+            posted_time=self.parse_posted_time(result["variableData"]["text"])
             if "variableData" in result
             and "text" in result["variableData"]
             and result["variableData"]["type"] == "TIME_ON_INFO"
@@ -246,7 +270,9 @@ class ZillowScraper(Scraper):
             tax_assessed_value=property_data.get("taxAssessedValue"),
             lot_area_value=property_data.get("lotAreaValue"),
             lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
-            agent_name=property_data.get("attributionInfo", {}).get("agentName"),
+            agent=Agent(
+                name=property_data.get("attributionInfo", {}).get("agentName")
+            ),
             stories=property_data.get("resoFacts", {}).get("stories"),
             mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
             beds_min=property_data.get("bedrooms"),
@@ -298,20 +324,23 @@ class ZillowScraper(Scraper):
     def _get_headers(self):
         headers = {
-            "authority": "www.zillow.com",
-            "accept": "*/*",
-            "accept-language": "en-US,en;q=0.9",
-            "content-type": "application/json",
-            "origin": "https://www.zillow.com",
-            "referer": "https://www.zillow.com",
-            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
-            "sec-ch-ua-mobile": "?0",
-            "sec-ch-ua-platform": '"Windows"',
-            "sec-fetch-dest": "empty",
-            "sec-fetch-mode": "cors",
-            "sec-fetch-site": "same-origin",
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            'authority': 'www.zillow.com',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'accept-language': 'en-US,en;q=0.9',
+            'cache-control': 'max-age=0',
+            'cookie': '<your_cookie_here>',
+            'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'sec-fetch-dest': 'document',
+            'sec-fetch-mode': 'navigate',
+            'sec-fetch-site': 'same-origin',
+            'sec-fetch-user': '?1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
         }

         if self.cookies:
             headers['Cookie'] = self.cookies

         return headers
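Not part of the commit: the new `parse_posted_time` helper turns Zillow's relative "N hours/days ago"-style strings into absolute datetimes. A standalone sketch of the same logic:

```python
from datetime import datetime, timedelta


def parse_posted_time(time: str) -> datetime | None:
    """Mirror of the helper added above; strings with neither unit fall through to None."""
    int_time = int(time.split(" ")[0])
    if "hour" in time:
        return datetime.now() - timedelta(hours=int_time)
    if "day" in time:
        return datetime.now() - timedelta(days=int_time)
    return None


print(parse_posted_time("5 days ago"))    # roughly five days before now
print(parse_posted_time("12 hours ago"))  # roughly twelve hours before now
```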


@@ -12,3 +12,7 @@ class NoResultsFound(Exception):
 class GeoCoordsNotFound(Exception):
     """Raised when no property is found for the given address"""
+
+
+class SearchTooBroad(Exception):
+    """Raised when the search is too broad"""

poetry.lock generated

@@ -408,6 +408,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

+[[package]]
+name = "tls-client"
+version = "0.2.2"
+description = "Advanced Python HTTP Client."
+optional = false
+python-versions = "*"
+files = [
+    {file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
+    {file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
+]
+
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -450,4 +461,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"
+content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670"


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.2.15"
+version = "0.2.16"
 description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"
@@ -14,6 +14,7 @@ python = "^3.10"
 requests = "^2.31.0"
 pandas = "^2.1.0"
 openpyxl = "^3.1.2"
+tls-client = "^0.2.2"


 [tool.poetry.group.dev.dependencies]
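Not part of the commit: why the new dependency exists. `tls_client.Session` presents a browser-like TLS fingerprint while keeping a requests-style API, which is how the Zillow scraper above now issues its requests. A minimal sketch (the request URL is illustrative):

```python
import tls_client

# Same constructor arguments as the ZillowScraper.__init__ change above.
session = tls_client.Session(
    client_identifier="chrome112", random_tls_extension_order=True
)

resp = session.get("https://www.zillow.com/")  # illustrative request
print(resp.status_code)
```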


@@ -4,11 +4,13 @@ from homeharvest.exceptions import (
     InvalidListingType,
     NoResultsFound,
     GeoCoordsNotFound,
+    SearchTooBroad,
 )


 def test_redfin():
     results = [
+        scrape_property(location="San Diego", site_name="redfin", listing_type="for_sale"),
         scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
         scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
         scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
@@ -24,9 +26,10 @@ def test_redfin():
                 location="abceefg ju098ot498hh9",
                 site_name="redfin",
                 listing_type="for_sale",
-            )
+            ),
+            scrape_property(location="Florida", site_name="redfin", listing_type="for_rent"),
         ]
-    except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
+    except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad):
         assert True

     assert all([result is None for result in bad_results])