2023-09-19 19:13:20 -07:00
"""
homeharvest.realtor.__init__
~~~~~~~~~~~~
2023-10-04 06:58:55 -07:00
This module implements the scraper for realtor.com
2023-09-19 19:13:20 -07:00
"""
2024-04-16 12:55:44 -07:00
2024-04-30 13:29:54 -07:00
from concurrent . futures import ThreadPoolExecutor , as_completed
2023-10-09 09:00:36 -07:00
from datetime import datetime
2023-10-04 06:58:55 -07:00
from typing import Dict , Union , Optional
2023-09-15 20:58:54 -07:00
from . . import Scraper
2024-05-11 21:35:29 -07:00
from . . models import Property , Address , ListingType , Description , PropertyType , Agent , Broker
2023-09-15 20:58:54 -07:00
class RealtorScraper(Scraper):
    """Scraper for realtor.com built on its GraphQL and autocomplete endpoints."""

    # GraphQL endpoint the search / listing / property queries are posted to.
    SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
    # Prefix joined with a permalink / property id to build a property page URL.
    PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
    # GraphQL endpoint used for the extra per-property details query.
    PROPERTY_GQL = "https://graph.realtor.com/graphql"
    # Autocomplete endpoint that resolves a free-text location.
    ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
    # Thread-pool size used when converting search results into properties.
    NUM_PROPERTY_WORKERS = 20
2023-10-04 06:58:55 -07:00
2023-09-15 20:58:54 -07:00
def __init__(self, scraper_input):
    """Set up the scraper by handing ``scraper_input`` to the base class constructor."""
    super().__init__(scraper_input)
def handle_location(self):
    """
    Resolve ``self.location`` through the address autocomplete endpoint.

    A single suggestion is requested across all area types listed in
    ``area_types``.

    :return: the first entry of the response's ``autocomplete`` list, or
        ``None`` when that list is missing or empty.
    """
    params = {
        "input": self.location,
        # The listing type value, lower-cased with "_" turned into "-", is sent as the client id.
        "client_id": self.listing_type.value.lower().replace("_", "-"),
        "limit": "1",
        "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
    }

    response = self.session.get(self.ADDRESS_AUTOCOMPLETE_URL, params=params)

    # .get() so a payload without the key is treated like "no suggestion" instead of raising.
    suggestions = response.json().get("autocomplete")
    if not suggestions:
        return None

    return suggestions[0]
2023-10-04 10:07:32 -07:00
def handle_listing(self, listing_id: str) -> list[Property]:
    """
    Fetch one listing by id and convert it into a ``Property``.

    :param listing_id: value sent as the ``$listing_id`` query variable.
    :return: a one-element list containing the parsed listing.
    """
    query = """query Listing($listing_id: ID!) {
                listing(id: $listing_id) {
                    source {
                        id
                        listing_id
                    }
                    address {
                        street_direction
                        street_number
                        street_name
                        street_suffix
                        unit
                        city
                        state_code
                        postal_code
                        location {
                            coordinate {
                                lat
                                lon
                            }
                        }
                    }
                    basic {
                        sqft
                        beds
                        baths_full
                        baths_half
                        lot_sqft
                        sold_price
                        type
                        price
                        status
                        sold_date
                        list_date
                    }
                    details {
                        year_built
                        stories
                        garage
                        permalink
                    }
                    media {
                        photos {
                            href
                        }
                    }
                }
            }"""

    payload = {
        "query": query,
        "variables": {"listing_id": listing_id},
    }
    response = self.session.post(self.SEARCH_GQL_URL, json=payload)
    property_info = response.json()["data"]["listing"]

    basic = property_info["basic"]
    details = property_info["details"]

    # A missing or non-dict "source" simply yields no MLS information.
    source = property_info.get("source")
    if not isinstance(source, dict):
        source = {}

    # Each level may be absent or null; fall back to an empty dict so lat/lon become None.
    location = (property_info.get("address") or {}).get("location") or {}
    coordinate = location.get("coordinate") or {}

    def to_date(raw):
        # Only the part before "T" is parsed; anything after it is discarded.
        return datetime.strptime(raw.split("T")[0], "%Y-%m-%d") if raw else None

    list_date = to_date(basic.get("list_date"))
    last_sold_date = to_date(basic.get("sold_date"))
    pending_date = to_date(property_info.get("pending_date"))

    status = basic["status"].lower()
    days_on_mls = None
    if list_date:
        if status == "sold" and last_sold_date:
            days_on_mls = (last_sold_date - list_date).days
        elif status in ("for_sale", "for_rent"):
            days_on_mls = (datetime.now() - list_date).days
        # A negative span means the dates are inconsistent; report nothing.
        if days_on_mls and days_on_mls < 0:
            days_on_mls = None

    property_id = details["permalink"]
    prop_details = self.get_prop_details(property_id)

    price = basic.get("price")
    sqft = basic.get("sqft")

    listing = Property(
        mls=source.get("id"),
        mls_id=source.get("listing_id"),
        property_url=f"{self.PROPERTY_URL}{property_id}",
        status=basic["status"].upper(),
        list_price=basic["price"],
        list_date=list_date,
        # Only computed when both values are present and non-zero.
        prc_sqft=price / sqft if price and sqft else None,
        last_sold_date=last_sold_date,
        pending_date=pending_date,
        latitude=coordinate.get("lat"),
        longitude=coordinate.get("lon"),
        address=self._parse_address(property_info, search_type="handle_listing"),
        description=Description(
            # "or {}" also covers a key that is present but null.
            alt_photos=self.process_alt_photos((property_info.get("media") or {}).get("photos", [])),
            style=(basic.get("type") or "").upper(),
            beds=basic.get("beds"),
            baths_full=basic.get("baths_full"),
            baths_half=basic.get("baths_half"),
            sqft=sqft,
            lot_sqft=basic.get("lot_sqft"),
            sold_price=basic.get("sold_price"),
            year_built=details.get("year_built"),
            garage=details.get("garage"),
            stories=details.get("stories"),
            text=(property_info.get("description") or {}).get("text"),
        ),
        days_on_mls=days_on_mls,
        agents=prop_details.get("agents"),
        brokers=prop_details.get("brokers"),
        nearby_schools=prop_details.get("schools"),
        assessed_value=prop_details.get("assessed_value"),
        estimated_value=prop_details.get("estimated_value"),
    )

    return [listing]
def get_latest_listing_id(self, property_id: str) -> str | None:
    """
    Look up the listing id to use for a property.

    :param property_id: value sent as the ``$property_id`` query variable.
    :return: the ``listing_id`` of the listing flagged ``primary``, otherwise
        that of the first listing returned; ``None`` when the property has
        no listings at all.
    """
    query = """query Property($property_id: ID!) {
                property(id: $property_id) {
                    listings {
                        listing_id
                        primary
                    }
                }
            }
            """

    payload = {
        "query": query,
        "variables": {"property_id": property_id},
    }
    response = self.session.post(self.SEARCH_GQL_URL, json=payload)
    property_info = response.json()["data"]["property"]

    # Covers a null property, a null listings field and an empty list alike,
    # so the positional fallback below can never index into nothing.
    listings = property_info.get("listings") if property_info else None
    if not listings:
        return None

    primary_listing = next(
        (listing for listing in listings if listing["primary"]),
        None,
    )
    chosen = primary_listing if primary_listing else listings[0]
    return chosen["listing_id"]
2023-09-18 08:16:59 -07:00
def handle_address(self, property_id: str) -> list[Property]:
    """
    Handles a specific address & returns one property

    :param property_id: value sent as the ``$property_id`` query variable; it
        is also stored as the resulting property's ``mls_id``.
    :return: a one-element list containing the parsed property.
    """
    query = """query Property($property_id: ID!) {
                property(id: $property_id) {
                    property_id
                    details {
                        date_updated
                        garage
                        permalink
                        year_built
                        stories
                    }
                    address {
                        street_direction
                        street_number
                        street_name
                        street_suffix
                        unit
                        city
                        state_code
                        postal_code
                        location {
                            coordinate {
                                lat
                                lon
                            }
                        }
                    }
                    basic {
                        baths
                        beds
                        price
                        sqft
                        lot_sqft
                        type
                        sold_price
                    }
                    public_record {
                        lot_size
                        sqft
                        stories
                        units
                        year_built
                    }
                    primary_photo {
                        href
                    }
                    photos {
                        href
                    }
                }
            }"""

    # Extra details are fetched first, matching the original call order.
    prop_details = self.get_prop_details(property_id)

    payload = {
        "query": query,
        "variables": {"property_id": property_id},
    }
    response = self.session.post(self.SEARCH_GQL_URL, json=payload)
    property_info = response.json()["data"]["property"]

    permalink = property_info["details"]["permalink"]
    return [
        Property(
            mls_id=property_id,
            property_url=f"{self.PROPERTY_URL}{permalink}",
            address=self._parse_address(property_info, search_type="handle_address"),
            description=self._parse_description(property_info),
            agents=prop_details.get("agents"),
            brokers=prop_details.get("brokers"),
            nearby_schools=prop_details.get("schools"),
            assessed_value=prop_details.get("assessed_value"),
            estimated_value=prop_details.get("estimated_value"),
        )
    ]
2023-09-18 08:16:59 -07:00
2024-04-16 12:55:44 -07:00
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
    """
    Handles a location area & returns a list of properties

    One of three GraphQL queries is built from ``search_type``: ``"comps"``
    (radius search around coordinates), ``"area"`` (city / county / state /
    postal code) or, for any other value, a lookup by ``property_id``. Each
    result is converted into a ``Property`` on a thread pool.

    :param variables: GraphQL variables sent with the query. A ``"foreclosure"``
        entry that is exactly ``True`` or ``False`` also adds a foreclosure filter.
    :param search_type: ``"comps"``, ``"area"`` or anything else for an address lookup.
    :return: ``{"total": ..., "properties": [...]}``; ``total`` is 0 and the list
        empty when the response has no results section for the query.
    """
    # Selection set shared by all three queries; its last brace closes the outer query.
    results_query = """{
                        count
                        total
                        results {
                            pending_date
                            property_id
                            list_date
                            status
                            last_sold_price
                            last_sold_date
                            list_price
                            price_per_sqft
                            flags {
                                is_contingent
                                is_pending
                            }
                            description {
                                type
                                sqft
                                beds
                                baths_full
                                baths_half
                                lot_sqft
                                sold_price
                                year_built
                                garage
                                name
                                stories
                                text
                            }
                            source {
                                id
                                listing_id
                            }
                            hoa {
                                fee
                            }
                            location {
                                address {
                                    street_direction
                                    street_number
                                    street_name
                                    street_suffix
                                    unit
                                    city
                                    state_code
                                    postal_code
                                    coordinate {
                                        lon
                                        lat
                                    }
                                }
                                county {
                                    name
                                    fips_code
                                }
                                neighborhoods {
                                    name
                                }
                            }
                            tax_record {
                                public_record_id
                            }
                            primary_photo {
                                href
                            }
                            photos {
                                href
                            }
                        }
                    }
                }"""

    is_sold_search = self.listing_type == ListingType.SOLD

    # Sold searches filter and sort on sold_date, everything else on list_date.
    date_field = "sold_date" if is_sold_search else "list_date"
    date_param = ""
    if self.date_from and self.date_to:
        date_param = f'{date_field}: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
    elif self.last_x_days:
        date_param = f'{date_field}: {{ min: "$today-{self.last_x_days}D" }}'

    sort_param = f"sort: [{{ field: {date_field}, direction: desc }}]"

    # PENDING is queried as FOR_SALE plus a contingent/pending filter.
    is_pending_search = self.listing_type == ListingType.PENDING
    pending_or_contingent_param = (
        "or_filters: { contingent: true, pending: true }" if is_pending_search else ""
    )
    listing_type = ListingType.FOR_SALE if is_pending_search else self.listing_type

    # Identity checks on purpose: a missing or None value adds no filter at all.
    is_foreclosure = ""
    if variables.get("foreclosure") is True:
        is_foreclosure = "foreclosure: true"
    elif variables.get("foreclosure") is False:
        is_foreclosure = "foreclosure: false"

    if search_type == "comps":  #: comps search, came from an address
        query = """query Property_search(
                $coordinates: [Float]!
                $radius: String!
                $offset: Int!,
                ) {
                    home_search(
                        query: {
                            %s
                            nearby: {
                                coordinates: $coordinates
                                radius: $radius
                            }
                            status: %s
                            %s
                            %s
                        }
                        %s
                        limit: 200
                        offset: $offset
                ) %s""" % (
            is_foreclosure,
            listing_type.value.lower(),
            date_param,
            pending_or_contingent_param,
            sort_param,
            results_query,
        )
    elif search_type == "area":  #: general search, came from a general location
        query = """query Home_search(
                            $city: String,
                            $county: [String],
                            $state_code: String,
                            $postal_code: String
                            $offset: Int,
                        ) {
                            home_search(
                                query: {
                                    %s
                                    city: $city
                                    county: $county
                                    postal_code: $postal_code
                                    state_code: $state_code
                                    status: %s
                                    %s
                                    %s
                                }
                                %s
                                limit: 200
                                offset: $offset
                            ) %s""" % (
            is_foreclosure,
            listing_type.value.lower(),
            date_param,
            pending_or_contingent_param,
            sort_param,
            results_query,
        )
    else:  #: general search, came from an address
        query = (
            """query Property_search(
                    $property_id: [ID]!
                    $offset: Int!,
                ) {
                    property_search(
                        query: {
                            property_id: $property_id
                        }
                        limit: 1
                        offset: $offset
                    ) %s"""
            % results_query
        )

    payload = {
        "query": query,
        "variables": variables,
    }
    response = self.session.post(self.SEARCH_GQL_URL, json=payload)
    response_json = response.json()

    # The response section is named after the root field the query used.
    search_key = "home_search" if "home_search" in query else "property_search"

    if (
        response_json is None
        or "data" not in response_json
        or response_json["data"] is None
        or search_key not in response_json["data"]
        or response_json["data"][search_key] is None
        or "results" not in response_json["data"][search_key]
    ):
        return {"total": 0, "properties": []}

    search_data = response_json["data"][search_key]

    def process_property(result: dict) -> Property | None:
        """Convert one raw result into a ``Property``; ``None`` means it is filtered out."""
        source = result.get("source")
        if not isinstance(source, dict):
            source = {}
        mls = source.get("id")

        if not mls and self.mls_only:
            return None

        # "flags" may be null in the payload; treat that as "no flags set".
        flags = result.get("flags") or {}
        is_pending = flags.get("is_pending") or flags.get("is_contingent")
        if is_pending and self.listing_type != ListingType.PENDING:
            return None

        location = result.get("location") or {}
        coordinate = (location.get("address") or {}).get("coordinate") or {}
        county = location.get("county") or {}
        hoa = result.get("hoa")

        property_id = result["property_id"]
        prop_details = self.get_prop_details(property_id)

        if self.listing_type != ListingType.FOR_RENT:
            property_url = f"{self.PROPERTY_URL}{property_id}"
        else:
            property_url = f"{self.PROPERTY_URL}M{property_id}?listing_status=rental"

        return Property(
            mls=mls,
            mls_id=source.get("listing_id"),
            property_url=property_url,
            status="PENDING" if is_pending else result["status"].upper(),
            list_price=result["list_price"],
            list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
            prc_sqft=result.get("price_per_sqft"),
            last_sold_date=result.get("last_sold_date"),
            hoa_fee=hoa.get("fee") if hoa and isinstance(hoa, dict) else None,
            latitude=coordinate.get("lat"),
            longitude=coordinate.get("lon"),
            address=self._parse_address(result, search_type="general_search"),
            description=self._parse_description(result),
            neighborhoods=self._parse_neighborhoods(result),
            county=county.get("name"),
            fips_code=county.get("fips_code"),
            days_on_mls=self.calculate_days_on_mls(result),
            agents=prop_details.get("agents"),
            brokers=prop_details.get("brokers"),
            nearby_schools=prop_details.get("schools"),
            assessed_value=prop_details.get("assessed_value"),
            estimated_value=prop_details.get("estimated_value"),
        )

    properties: list[Property] = []
    with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
        futures = [executor.submit(process_property, raw) for raw in search_data["results"]]

        # Results are collected in completion order, not in response order.
        for future in as_completed(futures):
            parsed = future.result()
            if parsed:
                properties.append(parsed)

    return {
        "total": search_data["total"],
        "properties": properties,
    }
2023-09-16 13:39:03 -07:00
2023-09-15 20:58:54 -07:00
def search(self):
    """
    Run the search described by the scraper's settings.

    The location is resolved first. A bare address is looked up directly,
    falling back to its latest listing or to the address record when the
    property search finds nothing. An address with a radius becomes a comps
    search; anything else is an area search. Comps and area searches are
    paginated.

    :return: list of properties; empty when the location cannot be resolved
        or a comps search has no centroid to search around.
    """
    location_info = self.handle_location()
    if not location_info:
        return []

    location_type = location_info["area_type"]
    is_address = location_type == "address"

    if is_address and self.radius:
        search_type = "comps"
    elif is_address:
        search_type = "address"
    else:
        search_type = "area"

    search_variables = {
        "offset": 0,
    }

    if is_address:
        if not self.radius:  #: single address search, non comps
            property_id = location_info["mpr_id"]
            search_variables |= {"property_id": property_id}

            gql_results = self.general_search(search_variables, search_type=search_type)
            if gql_results["total"] != 0:
                return gql_results["properties"]

            # Nothing from the property search: try the latest listing, then the address record.
            listing_id = self.get_latest_listing_id(property_id)
            if listing_id is None:
                return self.handle_address(property_id)
            return self.handle_listing(listing_id)

        #: general search, comps (radius)
        if not location_info.get("centroid"):
            return []

        search_variables |= {
            "coordinates": list(location_info["centroid"].values()),
            "radius": "{} mi".format(self.radius),
        }
    elif location_type == "postal_code":
        search_variables |= {
            "postal_code": location_info.get("postal_code"),
        }
    else:  #: general search, location
        search_variables |= {
            "city": location_info.get("city"),
            "county": location_info.get("county"),
            "state_code": location_info.get("state_code"),
            "postal_code": location_info.get("postal_code"),
        }

    if self.foreclosure:
        search_variables["foreclosure"] = self.foreclosure

    first_page = self.general_search(search_variables, search_type=search_type)
    total = first_page["total"]
    homes = first_page["properties"]

    # Remaining pages are fetched in steps of 200, for offsets below min(total, 10000).
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(
                self.general_search,
                variables=search_variables | {"offset": offset},
                search_type=search_type,
            )
            for offset in range(200, min(total, 10000), 200)
        ]

        for future in as_completed(futures):
            homes.extend(future.result()["properties"])

    return homes
2023-10-04 06:58:55 -07:00
2024-04-30 13:29:54 -07:00
def get_prop_details(self, property_id: str) -> dict:
    """
    Fetch the optional extra details for one property.

    :param property_id: value sent as the ``$property_id`` query variable.
    :return: ``{}`` when ``self.extra_property_data`` is falsy; otherwise a dict
        with the keys ``agents``, ``brokers``, ``schools``, ``assessed_value``
        and ``estimated_value``, each ``None`` when the response has no value.
    """
    if not self.extra_property_data:
        return {}

    query = """query GetHome($property_id: ID!) {
                home(property_id: $property_id) {
                    __typename

                    advertisers {
                        __typename
                        type
                        name
                        email
                        phones { number type ext primary }
                    }

                    consumer_advertisers {
                        name
                        phone
                        href
                        type
                    }

                    nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
                        __typename schools { district { __typename id name } }
                    }
                    taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
                    estimates {
                        __typename
                        currentValues: current_values {
                            __typename
                            source { __typename type name }
                            estimate
                            estimateHigh: estimate_high
                            estimateLow: estimate_low
                            date
                            isBestHomeValue: isbest_homevalue
                        }
                    }
                }
            }"""

    variables = {"property_id": property_id}
    response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables})
    data = response.json()

    def get_key(keys: list):
        """Walk ``data`` along ``keys``; any missing step or falsy leaf yields ``{}``."""
        try:
            value = data
            for key in keys:
                value = value[key]
            return value or {}
        except (KeyError, TypeError, IndexError):
            return {}

    raw_agents = get_key(["data", "home", "advertisers"])
    raw_advertisers = get_key(["data", "home", "consumer_advertisers"])
    raw_schools = get_key(["data", "home", "nearbySchools", "schools"])
    # Only the first tax-history entry and the first current estimate are used.
    assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
    estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])

    agents = [
        Agent(
            name=ad["name"],
            email=ad["email"],
            phones=ad["phones"],
        )
        for ad in raw_agents
    ]

    # Consumer advertisers typed "Agent" are left out of the broker list.
    brokers = [
        Broker(
            name=ad["name"],
            phone=ad["phone"],
            website=ad["href"],
        )
        for ad in raw_advertisers
        if ad.get("type") != "Agent"
    ]

    # A school whose district is missing or null is skipped instead of raising.
    schools = [
        school["district"]["name"]
        for school in raw_schools
        if (school.get("district") or {}).get("name")
    ]

    return {
        "agents": agents if agents else None,
        "brokers": brokers if brokers else None,
        "schools": schools if schools else None,
        "assessed_value": assessed_value if assessed_value else None,
        "estimated_value": estimated_value if estimated_value else None,
    }
2024-04-16 12:55:44 -07:00
2023-10-04 06:58:55 -07:00
@staticmethod
def _parse_neighborhoods ( result : dict ) - > Optional [ str ] :
neighborhoods_list = [ ]
neighborhoods = result [ " location " ] . get ( " neighborhoods " , [ ] )
if neighborhoods :
for neighborhood in neighborhoods :
name = neighborhood . get ( " name " )
if name :
neighborhoods_list . append ( name )
return " , " . join ( neighborhoods_list ) if neighborhoods_list else None
@staticmethod
2023-10-18 16:32:43 -07:00
def handle_none_safely ( address_part ) :
if address_part is None :
return " "
return address_part
2024-05-02 08:48:53 -07:00
@staticmethod
def _parse_address(result: dict, search_type):
    """
    Build an ``Address`` from a raw result.

    :param result: raw payload of a search result, listing or property.
    :param search_type: ``"general_search"`` reads the address from
        ``result["location"]["address"]``; any other value reads ``result["address"]``.
    :return: the parsed ``Address``; the street is the space-joined number,
        direction, name and suffix, skipping parts that are ``None``.
    """
    if search_type == "general_search":
        address = result["location"]["address"]
    else:
        address = result["address"]

    street_parts = (
        address.get("street_number"),
        address.get("street_direction"),
        address.get("street_name"),
        address.get("street_suffix"),
    )

    return Address(
        street=" ".join(part for part in street_parts if part is not None).strip(),
        # .get() so a field missing from the payload becomes None instead of raising.
        unit=address.get("unit"),
        city=address.get("city"),
        state=address.get("state_code"),
        zip=address.get("postal_code"),
    )
@staticmethod
def _parse_description(result: dict) -> Description:
    """
    Build a ``Description`` from a raw search result or property payload.

    :param result: raw payload; details are read from its ``description``
        dict, photos from ``primary_photo`` and ``photos``.
    :return: the parsed ``Description``. ``sold_price`` is only kept when the
        payload has a ``last_sold_date`` or the sold price differs from the
        list price.
    """
    description_data = result.get("description", {})

    if description_data is None or not isinstance(description_data, dict):
        description_data = {}

    # A missing or null type becomes an empty string, so no PropertyType is built from it.
    style = (description_data.get("type") or "").upper()

    primary_photo = " "
    primary_photo_info = result.get("primary_photo")
    if primary_photo_info and "href" in primary_photo_info:
        primary_photo = primary_photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")

    sold_price = description_data.get("sold_price")
    #: has a sold date or list and sold price are different
    # list_price is read with .get(): payloads from the address lookup carry no such key.
    keep_sold_price = result.get("last_sold_date") or result.get("list_price") != sold_price

    return Description(
        primary_photo=primary_photo,
        alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
        style=PropertyType(style) if style else None,
        beds=description_data.get("beds"),
        baths_full=description_data.get("baths_full"),
        baths_half=description_data.get("baths_half"),
        sqft=description_data.get("sqft"),
        lot_sqft=description_data.get("lot_sqft"),
        sold_price=sold_price if keep_sold_price else None,
        year_built=description_data.get("year_built"),
        garage=description_data.get("garage"),
        stories=description_data.get("stories"),
        text=description_data.get("text"),
    )
2023-10-09 09:00:36 -07:00
@staticmethod
2023-10-09 09:02:51 -07:00
def calculate_days_on_mls ( result : dict ) - > Optional [ int ] :
2023-10-09 09:00:36 -07:00
list_date_str = result . get ( " list_date " )
list_date = datetime . strptime ( list_date_str . split ( " T " ) [ 0 ] , " % Y- % m- %d " ) if list_date_str else None
last_sold_date_str = result . get ( " last_sold_date " )
last_sold_date = datetime . strptime ( last_sold_date_str , " % Y- % m- %d " ) if last_sold_date_str else None
today = datetime . now ( )
if list_date :
2024-04-16 12:55:44 -07:00
if result [ " status " ] == " sold " :
2023-10-09 09:00:36 -07:00
if last_sold_date :
days = ( last_sold_date - list_date ) . days
if days > = 0 :
return days
2024-04-16 12:55:44 -07:00
elif result [ " status " ] in ( " for_sale " , " for_rent " ) :
2023-10-09 09:00:36 -07:00
days = ( today - list_date ) . days
if days > = 0 :
return days
2023-11-24 11:41:46 -08:00
@staticmethod
def process_alt_photos ( photos_info ) :
try :
alt_photos = [ ]
if photos_info :
for photo_info in photos_info :
href = photo_info . get ( " href " , " " )
alt_photo_href = href . replace ( " s.jpg " , " od-w480_h360_x2.webp?w=1080&q=75 " )
alt_photos . append ( alt_photo_href )
return alt_photos
except Exception :
pass