2023-09-19 19:13:20 -07:00
"""
homeharvest.realtor.__init__
~~~~~~~~~~~~
This module implements the scraper for realtor.com
"""
2023-10-03 15:05:17 -07:00
from . . models import Property , Address , ListingType
2023-09-15 20:58:54 -07:00
from . . import Scraper
2023-09-18 08:16:59 -07:00
from . . . . exceptions import NoResultsFound
from concurrent . futures import ThreadPoolExecutor , as_completed
2023-09-15 20:58:54 -07:00
class RealtorScraper(Scraper):
    """Scraper for realtor.com.

    A search is a two-step process: ``handle_location`` resolves the
    free-text ``self.location`` through the geo "suggest" endpoint, then
    ``search`` picks one of three GraphQL queries (single address, comps
    around an address, or a whole area) based on the suggestion's
    ``area_type`` and on ``self.radius``.

    The attributes ``session``, ``location``, ``listing_type``, ``radius`` and
    ``sold_last_x_days`` are read but never assigned here; presumably the
    ``Scraper`` base class sets them from ``scraper_input`` -- confirm there.
    """

    #: Page size requested from the search API; also the paging stride in ``search``.
    _PAGE_SIZE = 200

    #: Selection set shared by every search query. The trailing ``}`` closes
    #: the operation body opened by the query templates below.
    _RESULTS_QUERY = """{
                            count
                            total
                            results {
                                property_id
                                list_date
                                status
                                last_sold_price
                                last_sold_date
                                hoa {
                                    fee
                                }
                                description {
                                    baths_full
                                    baths_half
                                    beds
                                    lot_sqft
                                    sqft
                                    sold_price
                                    year_built
                                    garage
                                    sold_price
                                    type
                                    sub_type
                                    name
                                    stories
                                }
                                source {
                                    raw {
                                        area
                                        status
                                        style
                                    }
                                    last_update_date
                                    contract_date
                                    id
                                    listing_id
                                    name
                                    type
                                    listing_href
                                    community_id
                                    management_id
                                    corporation_id
                                    subdivision_status
                                    spec_id
                                    plan_id
                                    tier_rank
                                    feed_type
                                }
                                location {
                                    address {
                                        city
                                        country
                                        line
                                        postal_code
                                        state_code
                                        state
                                        coordinate {
                                            lon
                                            lat
                                        }
                                        street_direction
                                        street_name
                                        street_number
                                        street_post_direction
                                        street_suffix
                                        unit
                                    }
                                    neighborhoods {
                                        name
                                    }
                                }
                                list_price
                                price_per_sqft
                                style_category_tags {
                                    exterior
                                }
                                source {
                                    id
                                }
                            }
                        }
                    }"""

    # The templates use %-style named placeholders rather than f-strings or
    # str.format because GraphQL is full of literal braces.
    _AREA_QUERY = """query Home_search(
                        $city: String,
                        $county: [String],
                        $state_code: String,
                        $postal_code: String
                        $offset: Int,
                    ) {
                        home_search(
                            query: {
                                city: $city
                                county: $county
                                postal_code: $postal_code
                                state_code: $state_code
                                status: %(status)s
                                %(sold_date)s
                            }
                            limit: %(limit)d
                            offset: $offset
                        ) %(results)s"""

    _COMPS_QUERY = """query Property_search(
                        $coordinates: [Float]!
                        $radius: String!
                        $offset: Int!,
                    ) {
                        property_search(
                            query: {
                                nearby: {
                                    coordinates: $coordinates
                                    radius: $radius
                                }
                                %(sold_date)s
                            }
                            limit: %(limit)d
                            offset: $offset
                        ) %(results)s"""

    _ADDRESS_QUERY = """query Property_search(
                        $property_id: [ID]!
                        $offset: Int!,
                    ) {
                        property_search(
                            query: {
                                property_id: $property_id
                                %(sold_date)s
                            }
                            limit: %(limit)d
                            offset: $offset
                        ) %(results)s"""

    _PROPERTY_QUERY = """query Property($property_id: ID!) {
                    property(id: $property_id) {
                        property_id
                        details {
                            date_updated
                            garage
                            permalink
                            year_built
                            stories
                        }
                        address {
                            address_validation_code
                            city
                            country
                            county
                            line
                            postal_code
                            state_code
                            street_direction
                            street_name
                            street_number
                            street_suffix
                            street_post_direction
                            unit_value
                            unit
                            unit_descriptor
                            zip
                        }
                        basic {
                            baths
                            beds
                            price
                            sqft
                            lot_sqft
                            type
                            sold_price
                        }
                        public_record {
                            lot_size
                            sqft
                            stories
                            units
                            year_built
                        }
                    }
                }"""

    def __init__(self, scraper_input):
        """Initialise the result counter, the base scraper and the search URL."""
        self.counter = 1
        super().__init__(scraper_input)
        self.search_url = (
            "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
        )

    def handle_location(self):
        """Resolve ``self.location`` to the top autocomplete suggestion.

        Returns:
            The first entry of the ``autocomplete`` list in the response.

        Raises:
            NoResultsFound: if the response has no (or an empty) ``autocomplete`` list.
            An HTTP error from ``raise_for_status`` when the request itself fails.
        """
        headers = {
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.realtor.com",
            "referer": "https://www.realtor.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
        params = {
            "input": self.location,
            "client_id": self.listing_type.value.lower().replace("_", "-"),
            "limit": "1",
            "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
        }

        response = self.session.get(
            "https://parser-external.geo.moveaws.com/suggest",
            params=params,
            headers=headers,
        )
        # Same failure handling as general_search: surface HTTP errors
        # instead of failing later on an unexpected body.
        response.raise_for_status()

        suggestions = response.json().get("autocomplete")
        if not suggestions:
            raise NoResultsFound("No results found for location: " + self.location)
        return suggestions[0]

    def handle_address(self, property_id: str) -> list[Property]:
        """Fetch a single property by id and wrap it in a one-element list.

        Only the permalink and the story count are read from the response;
        ``mls_id`` is filled with ``property_id`` itself.

        Raises:
            An HTTP error from ``raise_for_status`` when the request fails.
        """
        payload = {
            "query": self._PROPERTY_QUERY,
            "variables": {"property_id": property_id},
        }
        response = self.session.post(self.search_url, json=payload)
        response.raise_for_status()

        details = response.json()["data"]["property"]["details"]
        return [
            Property(
                property_url="https://www.realtor.com/realestateandhomes-detail/"
                + details["permalink"],
                stories=details["stories"],
                mls_id=property_id,
            )
        ]

    def _build_search_query(self, search_type: str) -> str:
        """Render the GraphQL document for ``search_type``.

        ``"area"`` and ``"comp_address"`` select their own templates; any other
        value falls back to the property-id query.
        """
        sold_date_param = ""
        if self.listing_type == ListingType.SOLD and self.sold_last_x_days is not None:
            sold_date_param = 'sold_date: { min: "$today-%sD" }' % self.sold_last_x_days

        if search_type == "area":
            template = self._AREA_QUERY
        elif search_type == "comp_address":
            template = self._COMPS_QUERY
        else:
            template = self._ADDRESS_QUERY

        # Only the area template uses "status"; unused keys are ignored.
        return template % {
            "status": self.listing_type.value.lower(),
            "sold_date": sold_date_param,
            "limit": self._PAGE_SIZE,
            "results": self._RESULTS_QUERY,
        }

    @staticmethod
    def _parse_result(result: dict) -> Property | None:
        """Convert one search result into a ``Property``.

        Returns ``None`` when the result carries no ``source.listing_id``.
        Null or missing nested objects (description, location, address,
        coordinate, hoa) yield ``None`` fields instead of raising.
        """
        source = result.get("source")
        if not isinstance(source, dict):
            source = {}
        mls_id = source.get("listing_id")
        if not mls_id:
            return None

        description = result.get("description") or {}
        location = result.get("location") or {}
        address = location.get("address") or {}
        coordinate = address.get("coordinate") or {}
        hoa = result.get("hoa")

        neighborhood_names = [
            entry.get("name")
            for entry in location.get("neighborhoods") or []
            if entry.get("name")
        ]

        # Join only the parts that are present so a missing suffix does not
        # put the literal text "None" into the street line.
        street_parts = (
            address.get("street_number"),
            address.get("street_name"),
            address.get("street_suffix"),
        )
        street = " ".join(str(part) for part in street_parts if part not in (None, ""))

        status = result.get("status")
        style = description.get("type")
        list_date = result.get("list_date")

        return Property(
            property_url="https://www.realtor.com/realestateandhomes-detail/"
            + result["property_id"],
            mls=source.get("id"),
            mls_id=mls_id,
            status=status.upper() if status else None,
            style=style.upper() if style else None,
            beds=description.get("beds"),
            baths_full=description.get("baths_full"),
            baths_half=description.get("baths_half"),
            est_sf=description.get("sqft"),
            lot_sf=description.get("lot_sqft"),
            list_price=result.get("list_price"),
            # Keep only the date portion of the timestamp.
            list_date=list_date.split("T")[0] if list_date else None,
            sold_price=description.get("sold_price"),
            prc_sqft=result.get("price_per_sqft"),
            last_sold_date=result.get("last_sold_date"),
            hoa_fee=hoa.get("fee") if isinstance(hoa, dict) else None,
            address=Address(
                street=street,
                unit=address.get("unit"),
                city=address.get("city"),
                state=address.get("state_code"),
                zip=address.get("postal_code"),
            ),
            yr_blt=description.get("year_built"),
            latitude=coordinate.get("lat"),
            longitude=coordinate.get("lon"),
            prkg_gar=description.get("garage"),
            stories=description.get("stories"),
            neighborhoods=", ".join(neighborhood_names) if neighborhood_names else None,
        )

    def general_search(self, variables: dict, search_type: str, return_total: bool = False) -> list[Property] | int:
        """Run one page of a search and return its properties or the total.

        Args:
            variables: GraphQL variables for the chosen query (always includes
                ``offset``).
            search_type: ``"area"``, ``"comp_address"``, or anything else for a
                property-id search.
            return_total: when true, return the ``total`` reported by the API
                instead of the parsed results.

        Returns:
            The reported total (``0`` if absent) when ``return_total`` is set,
            otherwise the list of parsed properties, which is empty when the
            response holds no usable results.

        Raises:
            An HTTP error from ``raise_for_status`` when the request fails.
        """
        payload = {
            "query": self._build_search_query(search_type),
            "variables": variables,
        }
        response = self.session.post(self.search_url, json=payload)
        response.raise_for_status()
        response_json = response.json()

        search_key = "home_search" if search_type == "area" else "property_search"
        # Validate the envelope before reading anything from it; this also
        # protects the return_total path against a null "data".
        search_data = ((response_json or {}).get("data") or {}).get(search_key) or {}

        if return_total:
            return search_data.get("total") or 0

        properties: list[Property] = []
        for result in search_data.get("results") or []:
            self.counter += 1
            realty_property = self._parse_result(result)
            if realty_property is not None:
                properties.append(realty_property)
        return properties

    def search(self):
        """Resolve the location and return every matching property.

        * Address without a radius: one property-id search, falling back to
          ``handle_address`` when it returns nothing.
        * Address with a radius: comps search around the suggestion's centroid.
        * Anything else: area search by city / county / state / postal code.

        Area and comps searches first ask for the total, then fetch all pages
        concurrently; pages are appended in offset order.
        """
        location_info = self.handle_location()
        is_address = location_info["area_type"] == "address"
        is_for_comps = is_address and self.radius is not None

        search_variables = {"offset": 0}

        if is_address and not is_for_comps:  #: single address search, non comps
            property_id = location_info["mpr_id"]
            search_variables = search_variables | {"property_id": property_id}

            listings = self.general_search(search_variables, "address")
            if listings:
                return listings
            #: TODO: support single address search for query by property address (can go from property -> listing to get better data)
            return self.handle_address(property_id)

        if is_for_comps:  #: comps search
            search_type = "comp_address"
            # NOTE(review): relies on the key order of "centroid" matching the
            # coordinate order the API expects -- confirm against the API.
            coordinates = list(location_info["centroid"].values())
            search_variables = search_variables | {
                "coordinates": coordinates,
                "radius": "{}mi".format(self.radius),
            }
        else:  #: area search
            search_type = "area"
            search_variables = search_variables | {
                "city": location_info.get("city"),
                "county": location_info.get("county"),
                "state_code": location_info.get("state_code"),
                "postal_code": location_info.get("postal_code"),
            }

        total = self.general_search(search_variables, return_total=True, search_type=search_type)

        homes = []
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(
                    self.general_search,
                    variables=search_variables | {"offset": page_offset},
                    return_total=False,
                    search_type=search_type,
                )
                for page_offset in range(0, total, self._PAGE_SIZE)
            ]
            # Drain in submission order so the combined list is deterministic.
            for future in futures:
                homes.extend(future.result())

        return homes