fix: redfin

2026-03-05 03:54:29 -08:00 · 2024-04-04 17:05:41 -05:00 · 2024-04-04 17:05:00 -05:00
6 changed files with 49 additions and 74 deletions
--- a/example.py
+++ b/example.py
@@ -0,0 +1,11 @@
 from homeharvest import scrape_property
 import pandas as pd
 # Example: fetch Redfin listings for one ZIP code, show them, and save a CSV.
 search_options = dict(
    site_name=["redfin"],
    location="85281",
    listing_type="for_rent",  # for_sale / sold
 )
 properties: pd.DataFrame = scrape_property(**search_options)
 # Echo the resulting frame, then write it next to the script without the index column.
 print(properties)
 properties.to_csv('properties.csv', index=False)
--- a/homeharvest/core/scrapers/init.py
+++ b/homeharvest/core/scrapers/init.py
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 import requests
 import tls_client
 from .models import Property, ListingType, SiteName
@@ -13,20 +12,16 @@ class ScraperInput:
 class Scraper:
-    def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None):
+    def __init__(self, scraper_input: ScraperInput):
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type
-        if not session:
+        self.session = requests.Session()
-            self.session = requests.Session()
+        self.session.headers.update({"user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'})
        else:
            self.session = session
        if scraper_input.proxy:
            proxy_url = scraper_input.proxy
            proxies = {"http": proxy_url, "https": proxy_url}
            self.session.proxies.update(proxies)
        self.listing_type = scraper_input.listing_type
        self.site_name = scraper_input.site_name
--- a/homeharvest/core/scrapers/zillow/init.py
+++ b/homeharvest/core/scrapers/zillow/init.py
@@ -6,41 +6,16 @@ This module implements the scraper for zillow.com
 """
 import re
 import json
 import tls_client
 from .. import Scraper
 from requests.exceptions import HTTPError
 from ....utils import parse_address_one, parse_address_two
 from ....exceptions import GeoCoordsNotFound, NoResultsFound
 from ..models import Property, Address, ListingType, PropertyType, Agent
 import urllib.parse
 from datetime import datetime, timedelta
 class ZillowScraper(Scraper):
    def __init__(self, scraper_input):
-        session = tls_client.Session(
+        super().__init__(scraper_input)
-            client_identifier="chrome112", random_tls_extension_order=True
+        self.cookies = None
        )
        super().__init__(scraper_input, session)
        self.session.headers.update({
            'authority': 'www.zillow.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        })
        if not self.is_plausible_location(self.location):
            raise NoResultsFound("Invalid location input: {}".format(self.location))
@@ -57,18 +32,15 @@ class ZillowScraper(Scraper):
        url = (
            "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
            "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
-        ).format(urllib.parse.quote(location))
+        ).format(location)
-        resp = self.session.get(url)
+        response = self.session.get(url)
-        return resp.json()["results"] != []
+        return response.json()["results"] != []
    def search(self):
-        resp = self.session.get(self.url)
+        resp = self.session.get(self.url, headers=self._get_headers())
-        if resp.status_code != 200:
+        resp.raise_for_status()
            raise HTTPError(
                f"bad response status code: {resp.status_code}"
            )
        content = resp.text
        match = re.search(
@@ -162,23 +134,12 @@ class ZillowScraper(Scraper):
            "wants": {"cat1": ["mapResults"]},
            "isDebugRequest": False,
        }
-        resp = self.session.put(url, json=payload)
+        resp = self.session.put(url, headers=self._get_headers(), json=payload)
-        if resp.status_code != 200:
+        resp.raise_for_status()
-            raise HTTPError(
+        self.cookies = resp.cookies
-                f"bad response status code: {resp.status_code}"
+        a = resp.json()
            )
        return self._parse_properties(resp.json())
    @staticmethod
    def parse_posted_time(time: str) -> datetime:
        int_time = int(time.split(" ")[0])
        if "hour" in time:
            return datetime.now() - timedelta(hours=int_time)
        if "day" in time:
            return datetime.now() - timedelta(days=int_time)
    def _parse_properties(self, property_data: dict):
        mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@@ -204,7 +165,7 @@ class ZillowScraper(Scraper):
                        home_info["statusType"] if "statusType" in home_info else self.listing_type
                    ),
                    status_text=result.get("statusText"),
-                    posted_time=self.parse_posted_time(result["variableData"]["text"])
+                    posted_time=result["variableData"]["text"]  #: TODO: change to datetime
                    if "variableData" in result
                       and "text" in result["variableData"]
                       and result["variableData"]["type"] == "TIME_ON_INFO"
@@ -336,3 +297,24 @@ class ZillowScraper(Scraper):
            state=state,
            zip_code=zip_code,
        )
    def _get_headers(self):
        headers = {
            'authority': 'www.zillow.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9',
            'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        }
        if self.cookies:
            headers['Cookie'] = self.cookies
        return headers
--- a/poetry.lock
+++ b/poetry.lock
@@ -408,17 +408,6 @@ files = [
    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 [[package]]
 name = "tls-client"
 version = "0.2.2"
 description = "Advanced Python HTTP Client."
 optional = false
 python-versions = "*"
 files = [
    {file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
    {file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
 ]
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -461,4 +450,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670"
+content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.2.18"
+version = "0.2.15"
 description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 homepage = "https://github.com/ZacharyHampton/HomeHarvest"
@@ -14,7 +14,6 @@ python = "^3.10"
 requests = "^2.31.0"
 pandas = "^2.1.0"
 openpyxl = "^3.1.2"
 tls-client = "^0.2.2"
 [tool.poetry.group.dev.dependencies]
--- a/tests/test_zillow.py
+++ b/tests/test_zillow.py
@@ -11,7 +11,6 @@ def test_zillow():
    results = [
        scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"),
        scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"),
        scrape_property(location="Surprise, AZ", site_name=["zillow"], listing_type="for_sale"),
        scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"),
        scrape_property(location="85281", site_name="zillow"),
        scrape_property(location="3268 88th st s, Lakewood", site_name="zillow", listing_type="for_rent"),
Author	SHA1	Message	Date
Cullen	be20258535	fix: redfin	2024-04-04 17:05:41 -05:00
Cullen	d05bc5d79f	fix: redfin	2024-04-04 17:05:00 -05:00