refactor(realtor): fit to updated models

pull/1/head
Cullen Watson 2023-09-18 15:43:44 -05:00
parent ffd3ce6aed
commit 869d7e7c51
8 changed files with 163 additions and 109 deletions

View File

@ -70,7 +70,10 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["site_name"] = prop_data["site_name"].value prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value.lower() prop_data["listing_type"] = prop_data["listing_type"].value.lower()
prop_data["property_type"] = prop_data["property_type"].value.lower() if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower()
else:
prop_data["property_type"] = None
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["street_address"] = address_data.street_address prop_data["street_address"] = address_data.street_address
@ -108,7 +111,7 @@ def scrape_property(
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType[listing_type.upper()], listing_type=ListingType[listing_type.upper()],
site_name=SiteName[site_name.upper()], site_name=SiteName.get_by_value(site_name.lower()),
) )
site = _scrapers[site_name.lower()](scraper_input) site = _scrapers[site_name.lower()](scraper_input)

View File

@ -7,6 +7,13 @@ class SiteName(Enum):
REDFIN = "redfin" REDFIN = "redfin"
REALTOR = "realtor.com" REALTOR = "realtor.com"
@classmethod
def get_by_value(cls, value):
    """Return the member of this enum whose ``value`` equals *value*.

    Members are scanned in iteration order and the first match is returned.

    Raises:
        ValueError: if no member carries the requested value.
    """
    found = next((member for member in cls if member.value == value), None)
    if found is None:
        raise ValueError(f"{value} not found in {cls}")
    return found
class ListingType(Enum): class ListingType(Enum):
FOR_SALE = "FOR_SALE" FOR_SALE = "FOR_SALE"
@ -57,14 +64,13 @@ class Address:
country: str | None = None country: str | None = None
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str
site_name: SiteName site_name: SiteName
listing_type: ListingType listing_type: ListingType
property_type: PropertyType
address: Address address: Address
property_type: PropertyType | None = None
# house for sale # house for sale
price: int | None = None price: int | None = None
@ -78,7 +84,6 @@ class Property:
stories: int | None = None stories: int | None = None
year_built: int | None = None year_built: int | None = None
price_per_sqft: int | None = None price_per_sqft: int | None = None
year_built: int | None = None
mls_id: str | None = None mls_id: str | None = None
agent_name: str | None = None agent_name: str | None = None

View File

@ -3,6 +3,7 @@ from ..models import Property, Address
from .. import Scraper from .. import Scraper
from typing import Any, Generator from typing import Any, Generator
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -29,7 +30,7 @@ class RealtorScraper(Scraper):
params = { params = {
"input": self.location, "input": self.location,
"client_id": self.listing_type.value.replace('_', '-'), "client_id": self.listing_type.value.lower().replace("_", "-"),
"limit": "1", "limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
@ -96,46 +97,57 @@ class RealtorScraper(Scraper):
} }
}""" }"""
variables = { variables = {"property_id": property_id}
'property_id': property_id
}
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response_json = response.json() response_json = response.json()
property_info = response_json['data']['property'] property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"]
unit = parse_address_two(street_address)
return [Property( return [
site_name=self.site_name, Property(
address=Address( site_name=self.site_name,
address_one=property_info['address']['line'], address=Address(
city=property_info['address']['city'], street_address=street_address,
state=property_info['address']['state_code'], city=property_info["address"]["city"],
zip_code=property_info['address']['postal_code'], state=property_info["address"]["state_code"],
), zip_code=property_info["address"]["postal_code"],
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], unit=unit,
beds=property_info['basic']['beds'], country="USA",
baths=property_info['basic']['baths'], ),
stories=property_info['details']['stories'], property_url="https://www.realtor.com/realestateandhomes-detail/"
year_built=property_info['details']['year_built'], + property_info["details"]["permalink"],
square_feet=property_info['basic']['sqft'], beds=property_info["basic"]["beds"],
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] baths=property_info["basic"]["baths"],
if property_info['basic']['sqft'] is not None and stories=property_info["details"]["stories"],
property_info['basic']['price'] is not None year_built=property_info["details"]["year_built"],
else None, square_feet=property_info["basic"]["sqft"],
price=property_info['basic']['price'], price_per_sqft=property_info["basic"]["price"]
mls_id=property_id, // property_info["basic"]["sqft"]
listing_type=self.listing_type, if property_info["basic"]["sqft"] is not None
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, and property_info["basic"]["price"] is not None
)] else None,
price=property_info["basic"]["price"],
mls_id=property_id,
listing_type=self.listing_type,
lot_area_value=property_info["public_record"]["lot_size"]
if property_info["public_record"] is not None
else None,
)
]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: def handle_area(
query = """query Home_search( self, variables: dict, return_total: bool = False
) -> list[Property] | int:
query = (
"""query Home_search(
$city: String, $city: String,
$county: [String], $county: [String],
$state_code: String, $state_code: String,
@ -193,42 +205,57 @@ class RealtorScraper(Scraper):
} }
} }
} }
}""" % self.listing_type.value }"""
% self.listing_type.value.lower()
)
payload = { payload = {
'query': query, "query": query,
'variables': variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.search_url, json=payload)
response.raise_for_status()
response_json = response.json() response_json = response.json()
if return_total: if return_total:
return response_json['data']['home_search']['total'] return response_json["data"]["home_search"]["total"]
properties: list[Property] = [] properties: list[Property] = []
for result in response_json['data']['home_search']['results']: if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "home_search" not in response_json["data"]
or response_json["data"]["home_search"] is None
or "results" not in response_json["data"]["home_search"]
):
return []
for result in response_json["data"]["home_search"]["results"]:
realty_property = Property( realty_property = Property(
address=Address( address=Address(
address_one=result['location']['address']['line'], street_address=result["location"]["address"]["line"],
city=result['location']['address']['city'], city=result["location"]["address"]["city"],
state=result['location']['address']['state_code'], state=result["location"]["address"]["state_code"],
zip_code=result['location']['address']['postal_code'], zip_code=result["location"]["address"]["postal_code"],
address_two=result['location']['address']['unit'], unit=result["location"]["address"]["unit"],
country="USA",
), ),
site_name=self.site_name, site_name=self.site_name,
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], property_url="https://www.realtor.com/realestateandhomes-detail/"
beds=result['description']['beds'], + result["property_id"],
baths=result['description']['baths'], beds=result["description"]["beds"],
stories=result['description']['stories'], baths=result["description"]["baths"],
year_built=result['description']['year_built'], stories=result["description"]["stories"],
square_feet=result['description']['sqft'], year_built=result["description"]["year_built"],
price_per_square_foot=result['price_per_sqft'], square_feet=result["description"]["sqft"],
price=result['list_price'], price_per_sqft=result["price_per_sqft"],
mls_id=result['property_id'], price=result["list_price"],
mls_id=result["property_id"],
listing_type=self.listing_type, listing_type=self.listing_type,
lot_size=result['description']['lot_sqft'], lot_area_value=result["description"]["lot_sqft"],
) )
properties.append(realty_property) properties.append(realty_property)
@ -239,17 +266,17 @@ class RealtorScraper(Scraper):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]
if location_type == 'address': if location_type == "address":
property_id = location_info['mpr_id'] property_id = location_info["mpr_id"]
return self.handle_address(property_id) return self.handle_address(property_id)
offset = 0 offset = 0
search_variables = { search_variables = {
'city': location_info.get('city'), "city": location_info.get("city"),
'county': location_info.get('county'), "county": location_info.get("county"),
'state_code': location_info.get('state_code'), "state_code": location_info.get("state_code"),
'postal_code': location_info.get('postal_code'), "postal_code": location_info.get("postal_code"),
'offset': offset, "offset": offset,
} }
total = self.handle_area(search_variables, return_total=True) total = self.handle_area(search_variables, return_total=True)
@ -258,8 +285,11 @@ class RealtorScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures = [ futures = [
executor.submit( executor.submit(
self.handle_area, variables=search_variables | {'offset': i}, return_total=False self.handle_area,
) for i in range(0, total, 200) variables=search_variables | {"offset": i},
return_total=False,
)
for i in range(0, total, 200)
] ]
for future in as_completed(futures): for future in as_completed(futures):

View File

@ -100,28 +100,27 @@ class RedfinScraper(Scraper):
address=Address( address=Address(
street_address=" ".join( street_address=" ".join(
[ [
building['address']['streetNumber'], building["address"]["streetNumber"],
building['address']['directionalPrefix'], building["address"]["directionalPrefix"],
building['address']['streetName'], building["address"]["streetName"],
building['address']['streetType'], building["address"]["streetType"],
] ]
), ),
city=building['address']['city'], city=building["address"]["city"],
state=building['address']['stateOrProvinceCode'], state=building["address"]["stateOrProvinceCode"],
zip_code=building['address']['postalCode'], zip_code=building["address"]["postalCode"],
unit=" ".join( unit=" ".join(
[ [
building['address']['unitType'], building["address"]["unitType"],
building['address']['unitValue'], building["address"]["unitValue"],
] ]
) ),
), ),
property_url="https://www.redfin.com{}".format(building["url"]), property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type, listing_type=self.listing_type,
bldg_unit_count=building["numUnitsForSale"], bldg_unit_count=building["numUnitsForSale"],
) )
def handle_address(self, home_id: str): def handle_address(self, home_id: str):
""" """
EPs: EPs:
@ -160,7 +159,8 @@ class RedfinScraper(Scraper):
homes = [ homes = [
self._parse_home(home) for home in response_json["payload"]["homes"] self._parse_home(home) for home in response_json["payload"]["homes"]
] + [ ] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values() self._parse_building(building)
for building in response_json["payload"]["buildings"].values()
] ]
return homes return homes

View File

@ -98,26 +98,24 @@ class ZillowScraper(Scraper):
else filter_state_sold else filter_state_sold
) )
payload = json.dumps( payload = {
{ "searchQueryState": {
"searchQueryState": { "pagination": {},
"pagination": {}, "isMapVisible": True,
"isMapVisible": True, "mapBounds": {
"mapBounds": { "west": coords[0],
"west": coords[0], "east": coords[1],
"east": coords[1], "south": coords[2],
"south": coords[2], "north": coords[3],
"north": coords[3],
},
"filterState": selected_filter,
"isListVisible": True,
"mapZoom": 11,
}, },
"wants": {"cat1": ["mapResults"]}, "filterState": selected_filter,
"isDebugRequest": False, "isListVisible": True,
} "mapZoom": 11,
) },
resp = self.session.put(url, headers=self._get_headers(), data=payload) "wants": {"cat1": ["mapResults"]},
"isDebugRequest": False,
}
resp = self.session.put(url, headers=self._get_headers(), json=payload)
resp.raise_for_status() resp.raise_for_status()
a = resp.json() a = resp.json()
return self._parse_properties(resp.json()) return self._parse_properties(resp.json())
@ -176,9 +174,7 @@ class ZillowScraper(Scraper):
and result["variableData"]["type"] == "TIME_ON_INFO" and result["variableData"]["type"] == "TIME_ON_INFO"
else None, else None,
"img_src": result.get("imgSrc"), "img_src": result.get("imgSrc"),
"price_per_sqft": int( "price_per_sqft": int(home_info["price"] // home_info["livingArea"])
home_info["price"] // home_info["livingArea"]
)
if "livingArea" in home_info and "price" in home_info if "livingArea" in home_info and "price" in home_info
else None, else None,
} }

View File

@ -3,9 +3,17 @@ from homeharvest import scrape_property
def test_realtor(): def test_realtor():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), scrape_property(
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format location="2530 Al Lipscomb Way",
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format site_name="realtor.com",
listing_type="for_sale",
),
scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
), #: does not support "city, state, USA" format
scrape_property(location="85281", site_name="realtor.com"), scrape_property(location="85281", site_name="realtor.com"),
] ]

View File

@ -3,9 +3,15 @@ from homeharvest import scrape_property
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"), location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="redfin"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
),
scrape_property(location="85281", site_name="redfin"), scrape_property(location="85281", site_name="redfin"),
] ]

View File

@ -3,9 +3,15 @@ from homeharvest import scrape_property
def test_zillow(): def test_zillow():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name="zillow"), location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
scrape_property(location="Dallas, TX, USA", site_name="zillow"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
] ]