Compare commits

..

2 Commits

Author SHA1 Message Date
Cullen
be20258535 fix: redfin 2024-04-04 17:05:41 -05:00
Cullen
d05bc5d79f fix: redfin 2024-04-04 17:05:00 -05:00
6 changed files with 49 additions and 74 deletions

11
example.py Normal file
View File

@@ -0,0 +1,11 @@
"""Example: scrape Redfin rental listings for one location and save them as CSV."""
import pandas as pd
from homeharvest import scrape_property

# Search parameters; listing_type may also be "for_sale" or "sold".
search_options = {
    "site_name": ["redfin"],
    "location": "85281",
    "listing_type": "for_rent",
}

properties: pd.DataFrame = scrape_property(**search_options)

# Show the result on stdout, then persist it without the index column.
print(properties)
properties.to_csv("properties.csv", index=False)

View File

@@ -1,6 +1,5 @@
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
import tls_client
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
@@ -13,20 +12,16 @@ class ScraperInput:
class Scraper: class Scraper:
def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None): def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
if not session: self.session = requests.Session()
self.session = requests.Session() self.session.headers.update({"user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'})
else:
self.session = session
if scraper_input.proxy: if scraper_input.proxy:
proxy_url = scraper_input.proxy proxy_url = scraper_input.proxy
proxies = {"http": proxy_url, "https": proxy_url} proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name

View File

@@ -6,41 +6,16 @@ This module implements the scraper for zillow.com
""" """
import re import re
import json import json
import tls_client
from .. import Scraper from .. import Scraper
from requests.exceptions import HTTPError
from ....utils import parse_address_one, parse_address_two from ....utils import parse_address_one, parse_address_two
from ....exceptions import GeoCoordsNotFound, NoResultsFound from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, ListingType, PropertyType, Agent from ..models import Property, Address, ListingType, PropertyType, Agent
import urllib.parse
from datetime import datetime, timedelta
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
session = tls_client.Session( super().__init__(scraper_input)
client_identifier="chrome112", random_tls_extension_order=True self.cookies = None
)
super().__init__(scraper_input, session)
self.session.headers.update({
'authority': 'www.zillow.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
})
if not self.is_plausible_location(self.location): if not self.is_plausible_location(self.location):
raise NoResultsFound("Invalid location input: {}".format(self.location)) raise NoResultsFound("Invalid location input: {}".format(self.location))
@@ -57,18 +32,15 @@ class ZillowScraper(Scraper):
url = ( url = (
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={" "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render" "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
).format(urllib.parse.quote(location)) ).format(location)
resp = self.session.get(url) response = self.session.get(url)
return resp.json()["results"] != [] return response.json()["results"] != []
def search(self): def search(self):
resp = self.session.get(self.url) resp = self.session.get(self.url, headers=self._get_headers())
if resp.status_code != 200: resp.raise_for_status()
raise HTTPError(
f"bad response status code: {resp.status_code}"
)
content = resp.text content = resp.text
match = re.search( match = re.search(
@@ -162,23 +134,12 @@ class ZillowScraper(Scraper):
"wants": {"cat1": ["mapResults"]}, "wants": {"cat1": ["mapResults"]},
"isDebugRequest": False, "isDebugRequest": False,
} }
resp = self.session.put(url, json=payload) resp = self.session.put(url, headers=self._get_headers(), json=payload)
if resp.status_code != 200: resp.raise_for_status()
raise HTTPError( self.cookies = resp.cookies
f"bad response status code: {resp.status_code}" a = resp.json()
)
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
@staticmethod
def parse_posted_time(time: str) -> datetime:
int_time = int(time.split(" ")[0])
if "hour" in time:
return datetime.now() - timedelta(hours=int_time)
if "day" in time:
return datetime.now() - timedelta(days=int_time)
def _parse_properties(self, property_data: dict): def _parse_properties(self, property_data: dict):
mapresults = property_data["cat1"]["searchResults"]["mapResults"] mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@@ -204,7 +165,7 @@ class ZillowScraper(Scraper):
home_info["statusType"] if "statusType" in home_info else self.listing_type home_info["statusType"] if "statusType" in home_info else self.listing_type
), ),
status_text=result.get("statusText"), status_text=result.get("statusText"),
posted_time=self.parse_posted_time(result["variableData"]["text"]) posted_time=result["variableData"]["text"] #: TODO: change to datetime
if "variableData" in result if "variableData" in result
and "text" in result["variableData"] and "text" in result["variableData"]
and result["variableData"]["type"] == "TIME_ON_INFO" and result["variableData"]["type"] == "TIME_ON_INFO"
@@ -336,3 +297,24 @@ class ZillowScraper(Scraper):
state=state, state=state,
zip_code=zip_code, zip_code=zip_code,
) )
def _get_headers(self):
    """Build the browser-like headers sent with each Zillow request.

    :return: a new dict of header names to string values. A ``Cookie``
        header is included only when ``self.cookies`` is non-empty.
    """
    headers = {
        'authority': 'www.zillow.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }

    cookies = self.cookies
    if cookies:
        # self.cookies is stored from a response's cookie jar, which is a
        # mapping-like object rather than text. Header values must be
        # str/bytes, so serialise the jar into "name=value; name=value".
        if not isinstance(cookies, (str, bytes)):
            cookies = "; ".join(
                f"{name}={value}" for name, value in cookies.items()
            )
        headers['Cookie'] = cookies

    return headers

13
poetry.lock generated
View File

@@ -408,17 +408,6 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
] ]
[[package]]
name = "tls-client"
version = "0.2.2"
description = "Advanced Python HTTP Client."
optional = false
python-versions = "*"
files = [
{file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
{file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
]
[[package]] [[package]]
name = "tomli" name = "tomli"
version = "2.0.1" version = "2.0.1"
@@ -461,4 +450,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670" content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.2.18" version = "0.2.15"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"
@@ -14,7 +14,6 @@ python = "^3.10"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.0" pandas = "^2.1.0"
openpyxl = "^3.1.2" openpyxl = "^3.1.2"
tls-client = "^0.2.2"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@@ -11,7 +11,6 @@ def test_zillow():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"), scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"), scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
scrape_property(location="Surprise, AZ", site_name=["zillow"], listing_type="for_sale"),
scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"), scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
scrape_property(location="3268 88th st s, Lakewood", site_name="zillow", listing_type="for_rent"), scrape_property(location="3268 88th st s, Lakewood", site_name="zillow", listing_type="for_rent"),