From 73b6d5b33f669eafafb2739a27586806f5b57225 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Thu, 28 Sep 2023 19:34:01 -0500 Subject: [PATCH] [fix] zillow tls client --- homeharvest/core/scrapers/zillow/__init__.py | 30 ++++++++++++++------ poetry.lock | 13 ++++++++- pyproject.toml | 1 + 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 5911e98..db1a106 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -6,7 +6,11 @@ This module implements the scraper for zillow.com """ import re import json + +import tls_client + from .. import Scraper +from requests.exceptions import HTTPError from ....utils import parse_address_one, parse_address_two from ....exceptions import GeoCoordsNotFound, NoResultsFound from ..models import Property, Address, ListingType, PropertyType, Agent @@ -16,6 +20,9 @@ class ZillowScraper(Scraper): def __init__(self, scraper_input): super().__init__(scraper_input) self.cookies = None + self.session = tls_client.Session( + client_identifier="chrome112", random_tls_extension_order=True + ) if not self.is_plausible_location(self.location): raise NoResultsFound("Invalid location input: {}".format(self.location)) @@ -34,13 +41,16 @@ class ZillowScraper(Scraper): "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render" ).format(location) - response = self.session.get(url) + resp = self.session.get(url) - return response.json()["results"] != [] + return resp.json()["results"] != [] def search(self): resp = self.session.get(self.url, headers=self._get_headers()) - resp.raise_for_status() + if resp.status_code != 200: + raise HTTPError( + f"bad response status code: {resp.status_code}" + ) content = resp.text match = re.search( @@ -135,9 +145,11 @@ class ZillowScraper(Scraper): "isDebugRequest": False, } resp = self.session.put(url, headers=self._get_headers(), 
json=payload) - resp.raise_for_status() + if resp.status_code != 200: + raise HTTPError( + f"bad response status code: {resp.status_code}" + ) self.cookies = resp.cookies - a = resp.json() return self._parse_properties(resp.json()) def _parse_properties(self, property_data: dict): @@ -301,14 +313,16 @@ class ZillowScraper(Scraper): def _get_headers(self): headers = { 'authority': 'www.zillow.com', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept-language': 'en-US,en;q=0.9', - 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', + 'cache-control': 'max-age=0', + 'cookie': '', + 'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', - 'sec-fetch-site': 'none', + 'sec-fetch-site': 'same-origin', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', diff --git a/poetry.lock b/poetry.lock index d1ac3b1..32118ab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -408,6 +408,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tls-client" +version = "0.2.2" +description = "Advanced Python HTTP Client." 
+optional = false +python-versions = "*" +files = [ + {file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"}, + {file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -450,4 +461,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f" +content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670" diff --git a/pyproject.toml b/pyproject.toml index 3cb9309..adb721c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ python = "^3.10" requests = "^2.31.0" pandas = "^2.1.0" openpyxl = "^3.1.2" +tls-client = "^0.2.2" [tool.poetry.group.dev.dependencies]