refactor(redfin): fit to use updated models

pull/1/head
Cullen Watson 2023-09-18 14:07:37 -05:00
parent dc8c15959f
commit 471e53118e
4 changed files with 29 additions and 20 deletions

View File

@ -53,7 +53,7 @@ class Address:
city: str city: str
state: str state: str
zip_code: str zip_code: str
unit: str unit: str | None = None
country: str | None = None country: str | None = None

View File

@ -1,7 +1,8 @@
import json import json
from ..models import Property, Address, PropertyType
from .. import Scraper
from typing import Any from typing import Any
from .. import Scraper
from ....utils import parse_address_two
from ..models import Property, Address, PropertyType
class RedfinScraper(Scraper): class RedfinScraper(Scraper):
@ -38,20 +39,26 @@ class RedfinScraper(Scraper):
return home[key]["value"] return home[key]["value"]
if not single_search: if not single_search:
unit = parse_address_two(get_value("streetLine"))
address = Address( address = Address(
address_one=get_value("streetLine"), street_address=get_value("streetLine"),
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
else: else:
address_info = home["streetAddress"] address_info = home["streetAddress"]
unit = parse_address_two(address_info["assembledAddress"])
address = Address( address = Address(
address_one=address_info["assembledAddress"], street_address=address_info["assembledAddress"],
city=home["city"], city=home["city"],
state=home["state"], state=home["state"],
zip_code=home["zip"], zip_code=home["zip"],
unit=unit,
country="USA",
) )
url = "https://www.redfin.com{}".format(home["url"]) url = "https://www.redfin.com{}".format(home["url"])
property_type = home["propertyType"] if "propertyType" in home else None property_type = home["propertyType"] if "propertyType" in home else None
@ -69,7 +76,7 @@ class RedfinScraper(Scraper):
site_name=self.site_name, site_name=self.site_name,
listing_type=self.listing_type, listing_type=self.listing_type,
address=address, address=address,
url=url, property_url=url,
beds=home["beds"] if "beds" in home else None, beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None, baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None, stories=home["stories"] if "stories" in home else None,
@ -79,9 +86,9 @@ class RedfinScraper(Scraper):
if not single_search if not single_search
else home["yearBuilt"], else home["yearBuilt"],
square_feet=get_value("sqFt"), square_feet=get_value("sqFt"),
lot_size=lot_size, lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")), property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_square_foot=get_value("pricePerSqFt"), price_per_sqft=get_value("pricePerSqFt"),
price=get_value("price"), price=get_value("price"),
mls_id=get_value("mlsId"), mls_id=get_value("mlsId"),
) )

View File

@ -1,8 +1,9 @@
import re import re
import json import json
from ..models import Property, Address, ListingType, PropertyType, SiteName
from ....exceptions import NoResultsFound, PropertyNotFound
from .. import Scraper from .. import Scraper
from ....utils import parse_address_two
from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName
class ZillowScraper(Scraper): class ZillowScraper(Scraper):
@ -120,7 +121,7 @@ class ZillowScraper(Scraper):
resp = self.session.put(url, headers=self._get_headers(), data=payload) resp = self.session.put(url, headers=self._get_headers(), data=payload)
resp.raise_for_status() resp.raise_for_status()
a = resp.json() a = resp.json()
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
def _parse_properties(self, property_data: dict): def _parse_properties(self, property_data: dict):
mapresults = property_data["cat1"]["searchResults"]["mapResults"] mapresults = property_data["cat1"]["searchResults"]["mapResults"]
@ -249,7 +250,7 @@ class ZillowScraper(Scraper):
else property_data["hdpUrl"] else property_data["hdpUrl"]
) )
address_data = property_data["address"] address_data = property_data["address"]
unit = self._parse_address_two(address_data["streetAddress"]) unit = parse_address_two(address_data["streetAddress"])
address = Address( address = Address(
street_address=address_data["streetAddress"], street_address=address_data["streetAddress"],
unit=unit, unit=unit,
@ -288,11 +289,6 @@ class ZillowScraper(Scraper):
listing_type=self.listing_type, listing_type=self.listing_type,
) )
@staticmethod
def _parse_address_two(address_one: str):
apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
return apt_match.group().strip() if apt_match else None
def _extract_address(self, address_str): def _extract_address(self, address_str):
""" """
Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX', Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX',
@ -309,14 +305,14 @@ class ZillowScraper(Scraper):
if len(state_zip) == 1: if len(state_zip) == 1:
state = state_zip[0].strip() state = state_zip[0].strip()
zip_code = None zip_code = None
elif len(state_zip) == 2: elif len(state_zip) == 2:
state = state_zip[0].strip() state = state_zip[0].strip()
zip_code = state_zip[1].strip() zip_code = state_zip[1].strip()
else: else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}") raise ValueError(f"Unexpected state/zip format in address: {address_str}")
unit = self._parse_address_two(street_address) unit = parse_address_two(street_address)
return Address( return Address(
street_address=street_address, street_address=street_address,
city=city, city=city,
@ -335,7 +331,7 @@ class ZillowScraper(Scraper):
"content-type": "application/json", "content-type": "application/json",
"cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09', "cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; 
zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
"origin": "https://www.zillow.com", "origin": "https://www.zillow.com",
"referer": "https://www.zillow.com/homes/Dallas,-TX_rb/", "referer": "https://www.zillow.com",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0", "sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"', "sec-ch-ua-platform": '"Windows"',

6
homeharvest/utils.py Normal file
View File

@ -0,0 +1,6 @@
import re
def parse_address_two(address_one: str):
    """Pull a trailing unit designator (e.g. ``APT 5`` or ``#12B``) off a
    street-address string.

    Returns the matched unit text with surrounding whitespace stripped,
    or ``None`` when the address carries no unit suffix.
    """
    unit_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I)
    if unit_match is None:
        return None
    return unit_match.group().strip()