refactor(redfin): fit to use updated models

pull/1/head
Cullen Watson 2023-09-18 14:07:37 -05:00
parent dc8c15959f
commit 471e53118e
4 changed files with 29 additions and 20 deletions


@@ -53,7 +53,7 @@ class Address:
    city: str
    state: str
    zip_code: str
    unit: str
    unit: str | None = None
    country: str | None = None
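
With unit now defaulting to None, scrapers can skip it when no apartment or suite is parsed from the street line. A minimal sketch of how the updated model reads to callers; the @dataclass decorator and the street_address field are inferred from the constructor calls elsewhere in this diff rather than shown in this hunk, and the sample values are illustrative:

from dataclasses import dataclass

@dataclass
class Address:
    street_address: str
    city: str
    state: str
    zip_code: str
    unit: str | None = None       # now optional instead of required
    country: str | None = None

# unit and country can simply be omitted when a listing has neither
addr = Address(street_address="555 Wedglea Dr", city="Dallas", state="TX", zip_code="75211")
assert addr.unit is None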


@@ -1,7 +1,8 @@
import json
from ..models import Property, Address, PropertyType
from .. import Scraper
from typing import Any
from .. import Scraper
from ....utils import parse_address_two
from ..models import Property, Address, PropertyType
class RedfinScraper(Scraper):
@@ -38,20 +39,26 @@ class RedfinScraper(Scraper):
                return home[key]["value"]

        if not single_search:
            unit = parse_address_two(get_value("streetLine"))
            address = Address(
                address_one=get_value("streetLine"),
                street_address=get_value("streetLine"),
                city=home["city"],
                state=home["state"],
                zip_code=home["zip"],
                unit=unit,
                country="USA",
            )
        else:
            address_info = home["streetAddress"]
            unit = parse_address_two(address_info["assembledAddress"])
            address = Address(
                address_one=address_info["assembledAddress"],
                street_address=address_info["assembledAddress"],
                city=home["city"],
                state=home["state"],
                zip_code=home["zip"],
                unit=unit,
                country="USA",
            )

        url = "https://www.redfin.com{}".format(home["url"])
        property_type = home["propertyType"] if "propertyType" in home else None
@@ -69,7 +76,7 @@ class RedfinScraper(Scraper):
            site_name=self.site_name,
            listing_type=self.listing_type,
            address=address,
            url=url,
            property_url=url,
            beds=home["beds"] if "beds" in home else None,
            baths=home["baths"] if "baths" in home else None,
            stories=home["stories"] if "stories" in home else None,
@@ -79,9 +86,9 @@
            if not single_search
            else home["yearBuilt"],
            square_feet=get_value("sqFt"),
            lot_size=lot_size,
            lot_area_value=lot_size,
            property_type=PropertyType.from_int_code(home.get("propertyType")),
            price_per_square_foot=get_value("pricePerSqFt"),
            price_per_sqft=get_value("pricePerSqFt"),
            price=get_value("price"),
            mls_id=get_value("mlsId"),
        )
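
Several fields above are read through a get_value helper whose body only partly appears in this diff (the return home[key]["value"] line). A minimal sketch of what that helper presumably does, written here as a standalone function; in the scraper it is likely a closure over the home payload, and the sample payload below is illustrative:

from typing import Any

def get_value(home: dict, key: str) -> Any | None:
    # Redfin nests many fields as {"value": ...}; missing keys fall through to None
    if key in home and "value" in home[key]:
        return home[key]["value"]
    return None

home = {"sqFt": {"value": 1748}, "price": {"value": 325000}}
print(get_value(home, "sqFt"))          # 1748
print(get_value(home, "pricePerSqFt"))  # None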


@@ -1,8 +1,9 @@
import re
import json
from ..models import Property, Address, ListingType, PropertyType, SiteName
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper
from ....utils import parse_address_two
from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName
class ZillowScraper(Scraper):
@@ -120,7 +121,7 @@
        resp = self.session.put(url, headers=self._get_headers(), data=payload)
        resp.raise_for_status()
        a = resp.json()
        return self._parse_properties(resp.json())
        return parse_properties(resp.json())

    def _parse_properties(self, property_data: dict):
        mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@@ -249,7 +250,7 @@
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
        unit = self._parse_address_two(address_data["streetAddress"])
        unit = parse_address_two(address_data["streetAddress"])
        address = Address(
            street_address=address_data["streetAddress"],
            unit=unit,
@@ -288,11 +289,6 @@
            listing_type=self.listing_type,
        )

    @staticmethod
    def _parse_address_two(address_one: str):
        apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
        return apt_match.group().strip() if apt_match else None

    def _extract_address(self, address_str):
        """
        Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
@@ -316,7 +312,7 @@
        else:
            raise ValueError(f"Unexpected state/zip format in address: {address_str}")
        unit = self._parse_address_two(street_address)
        unit = parse_address_two(street_address)
        return Address(
            street_address=street_address,
            city=city,
@@ -335,7 +331,7 @@
"content-type": "application/json",
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
"origin": "https://www.zillow.com",
"referer": "https://www.zillow.com/homes/Dallas,-TX_rb/",
"referer": "https://www.zillow.com",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',

homeharvest/utils.py (new file, 6 additions)

@@ -0,0 +1,6 @@
import re


def parse_address_two(address_one: str):
    apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
    return apt_match.group().strip() if apt_match else None
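
The new shared helper strips an apartment/suite designator from the end of a street line and returns None when there is none. A quick usage sketch; the import path follows the new homeharvest/utils.py location, and the street lines are illustrative:

from homeharvest.utils import parse_address_two

print(parse_address_two("555 Wedglea Dr APT 212"))  # APT 212
print(parse_address_two("2301 Elm St #34"))         # #34
print(parse_address_two("555 Wedglea Dr"))          # None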