Compare commits

..

2 Commits

Author SHA1 Message Date
Cullen
be20258535 fix: redfin 2024-04-04 17:05:41 -05:00
Cullen
d05bc5d79f fix: redfin 2024-04-04 17:05:00 -05:00
5 changed files with 24 additions and 50 deletions

11
example.py Normal file
View File

@@ -0,0 +1,11 @@
from homeharvest import scrape_property
import pandas as pd
properties: pd.DataFrame = scrape_property(
site_name=["redfin"],
location="85281",
listing_type="for_rent" # for_sale / sold
)
print(properties)
properties.to_csv('properties.csv', index=False)

View File

@@ -17,6 +17,7 @@ class Scraper:
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
self.session = requests.Session() self.session = requests.Session()
self.session.headers.update({"user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'})
if scraper_input.proxy: if scraper_input.proxy:
proxy_url = scraper_input.proxy proxy_url = scraper_input.proxy
proxies = {"http": proxy_url, "https": proxy_url} proxies = {"http": proxy_url, "https": proxy_url}

View File

@@ -6,25 +6,16 @@ This module implements the scraper for zillow.com
""" """
import re import re
import json import json
import tls_client
from .. import Scraper from .. import Scraper
from requests.exceptions import HTTPError
from ....utils import parse_address_one, parse_address_two from ....utils import parse_address_one, parse_address_two
from ....exceptions import GeoCoordsNotFound, NoResultsFound from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, ListingType, PropertyType, Agent from ..models import Property, Address, ListingType, PropertyType, Agent
import urllib.parse
from datetime import datetime, timedelta
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
self.cookies = None self.cookies = None
self.session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
if not self.is_plausible_location(self.location): if not self.is_plausible_location(self.location):
raise NoResultsFound("Invalid location input: {}".format(self.location)) raise NoResultsFound("Invalid location input: {}".format(self.location))
@@ -41,18 +32,15 @@ class ZillowScraper(Scraper):
url = ( url = (
"https://www.zillowstatic.com/autocomplete/v3/suggestions?q={" "https://www.zillowstatic.com/autocomplete/v3/suggestions?q={"
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render" "}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
).format(urllib.parse.quote(location)) ).format(location)
resp = self.session.get(url) response = self.session.get(url)
return resp.json()["results"] != [] return response.json()["results"] != []
def search(self): def search(self):
resp = self.session.get(self.url, headers=self._get_headers()) resp = self.session.get(self.url, headers=self._get_headers())
if resp.status_code != 200: resp.raise_for_status()
raise HTTPError(
f"bad response status code: {resp.status_code}"
)
content = resp.text content = resp.text
match = re.search( match = re.search(
@@ -147,23 +135,11 @@ class ZillowScraper(Scraper):
"isDebugRequest": False, "isDebugRequest": False,
} }
resp = self.session.put(url, headers=self._get_headers(), json=payload) resp = self.session.put(url, headers=self._get_headers(), json=payload)
if resp.status_code != 200: resp.raise_for_status()
raise HTTPError(
f"bad response status code: {resp.status_code}"
)
self.cookies = resp.cookies self.cookies = resp.cookies
a = resp.json()
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
@staticmethod
def parse_posted_time(time: str) -> datetime:
int_time = int(time.split(" ")[0])
if "hour" in time:
return datetime.now() - timedelta(hours=int_time)
if "day" in time:
return datetime.now() - timedelta(days=int_time)
def _parse_properties(self, property_data: dict): def _parse_properties(self, property_data: dict):
mapresults = property_data["cat1"]["searchResults"]["mapResults"] mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@@ -189,7 +165,7 @@ class ZillowScraper(Scraper):
home_info["statusType"] if "statusType" in home_info else self.listing_type home_info["statusType"] if "statusType" in home_info else self.listing_type
), ),
status_text=result.get("statusText"), status_text=result.get("statusText"),
posted_time=self.parse_posted_time(result["variableData"]["text"]) posted_time=result["variableData"]["text"] #: TODO: change to datetime
if "variableData" in result if "variableData" in result
and "text" in result["variableData"] and "text" in result["variableData"]
and result["variableData"]["type"] == "TIME_ON_INFO" and result["variableData"]["type"] == "TIME_ON_INFO"
@@ -325,16 +301,14 @@ class ZillowScraper(Scraper):
def _get_headers(self): def _get_headers(self):
headers = { headers = {
'authority': 'www.zillow.com', 'authority': 'www.zillow.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-US,en;q=0.9', 'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0', 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'cookie': '<your_cookie_here>',
'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"', 'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document', 'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate', 'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin', 'sec-fetch-site': 'none',
'sec-fetch-user': '?1', 'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1', 'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',

13
poetry.lock generated
View File

@@ -408,17 +408,6 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
] ]
[[package]]
name = "tls-client"
version = "0.2.2"
description = "Advanced Python HTTP Client."
optional = false
python-versions = "*"
files = [
{file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
{file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
]
[[package]] [[package]]
name = "tomli" name = "tomli"
version = "2.0.1" version = "2.0.1"
@@ -461,4 +450,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670" content-hash = "3647d568f5623dd762f19029230626a62e68309fa2ef8be49a36382c19264a5f"

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.2.16" version = "0.2.15"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/ZacharyHampton/HomeHarvest"
@@ -14,7 +14,6 @@ python = "^3.10"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.0" pandas = "^2.1.0"
openpyxl = "^3.1.2" openpyxl = "^3.1.2"
tls-client = "^0.2.2"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]