- finished realtor

pull/1/head
Zachary Hampton 2023-09-18 08:16:59 -07:00
parent 905cfcae2c
commit ba9fe806a7
5 changed files with 236 additions and 15 deletions

View File

@@ -69,9 +69,9 @@ def process_result(result: Union[Building, Property]) -> pd.DataFrame:
     prop_data = result.__dict__
     address_data = prop_data["address"]
-    prop_data["site_name"] = prop_data["site_name"].value
+    prop_data["site_name"] = prop_data["site_name"]
     prop_data["listing_type"] = prop_data["listing_type"].value
-    prop_data["property_type"] = prop_data["property_type"].value.lower()
+    prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data["property_type"] else None
     prop_data["address_one"] = address_data.address_one
     prop_data["city"] = address_data.city
     prop_data["state"] = address_data.state
@@ -90,13 +90,13 @@ def scrape_property(
     location: str,
     site_name: str,
     listing_type: str = "for_sale",  #: for_sale, for_rent, sold
-) -> Union[list[Building], list[Property]]:
+) -> pd.DataFrame:
     validate_input(site_name, listing_type)
     scraper_input = ScraperInput(
         location=location,
         listing_type=ListingType[listing_type.upper()],
-        site_name=SiteName[site_name.upper()],
+        site_name=site_name.lower(),
     )
     site = _scrapers[site_name.lower()](scraper_input)
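
For reference, a minimal usage sketch of scrape_property after this change (it now returns a pandas DataFrame; the call mirrors the test file at the bottom of this diff, and listing_type accepts "for_sale", "for_rent", or "sold"):

    from homeharvest import scrape_property

    # listing_type defaults to "for_sale"; "for_rent" and "sold" are the other accepted values
    properties = scrape_property(location="85281", site_name="realtor.com", listing_type="for_sale")
    print(properties.head())  # one row per property, built by process_result above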

View File

@@ -7,13 +7,15 @@ from .models import Property, ListingType, SiteName
 class ScraperInput:
     location: str
     listing_type: ListingType
-    site_name: SiteName
+    site_name: str
     proxy_url: str | None = None

 class Scraper:
     def __init__(self, scraper_input: ScraperInput):
         self.location = scraper_input.location
+        self.listing_type = scraper_input.listing_type

         self.session = requests.Session()
         self.listing_type = scraper_input.listing_type
         self.site_name = scraper_input.site_name

View File

@@ -53,7 +53,7 @@ class Address:
 @dataclass()
 class Realty:
-    site_name: SiteName
+    site_name: str
     address: Address
     url: str
     listing_type: ListingType | None = None
@@ -68,7 +68,6 @@ class Property(Realty):
     year_built: int | None = None
     square_feet: int | None = None
     price_per_square_foot: int | None = None
-    year_built: int | None = None
     mls_id: str | None = None
     agent_name: str | None = None

View File

@@ -1,12 +1,15 @@
 import json
 from ..models import Property, Address
 from .. import Scraper
-from typing import Any
+from typing import Any, Generator
+from ....exceptions import NoResultsFound
+from concurrent.futures import ThreadPoolExecutor, as_completed


 class RealtorScraper(Scraper):
     def __init__(self, scraper_input):
         super().__init__(scraper_input)
+        self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"

     def handle_location(self):
         headers = {
@@ -26,7 +29,7 @@ class RealtorScraper(Scraper):
         params = {
             "input": self.location,
-            "client_id": "for-sale",
+            "client_id": self.listing_type.value.replace('_', '-'),
             "limit": "1",
             "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
         }
@@ -38,14 +41,228 @@ class RealtorScraper(Scraper):
         )
         response_json = response.json()

-        return response_json["autocomplete"][0]
+        result = response_json["autocomplete"]
+        if result is None:
+            raise NoResultsFound("No results found for location: " + self.location)
+        return result[0]
+
+    def handle_address(self, property_id: str) -> list[Property]:
+        query = """query Property($property_id: ID!) {
+                    property(id: $property_id) {
+                        property_id
+                        details {
+                            date_updated
+                            garage
+                            permalink
+                            year_built
+                            stories
+                        }
+                        address {
+                            address_validation_code
+                            city
+                            country
+                            county
+                            line
+                            postal_code
+                            state_code
+                            street_direction
+                            street_name
+                            street_number
+                            street_suffix
+                            street_post_direction
+                            unit_value
+                            unit
+                            unit_descriptor
+                            zip
+                        }
+                        basic {
+                            baths
+                            beds
+                            price
+                            sqft
+                            lot_sqft
+                            type
+                            sold_price
+                        }
+                        public_record {
+                            lot_size
+                            sqft
+                            stories
+                            units
+                            year_built
+                        }
+                    }
+                }"""
+
+        variables = {
+            'property_id': property_id
+        }
+
+        payload = {
+            'query': query,
+            'variables': variables,
+        }
+
+        response = self.session.post(self.search_url, json=payload)
+        response_json = response.json()
+
+        property_info = response_json['data']['property']
+
+        return [Property(
+            site_name=self.site_name,
+            address=Address(
+                address_one=property_info['address']['line'],
+                city=property_info['address']['city'],
+                state=property_info['address']['state_code'],
+                zip_code=property_info['address']['postal_code'],
+            ),
+            url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'],
+            beds=property_info['basic']['beds'],
+            baths=property_info['basic']['baths'],
+            stories=property_info['details']['stories'],
+            year_built=property_info['details']['year_built'],
+            square_feet=property_info['basic']['sqft'],
+            price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft']
+            if property_info['basic']['sqft'] is not None and property_info['basic']['price'] is not None
+            else None,
+            price=property_info['basic']['price'],
+            mls_id=property_id,
+            listing_type=self.listing_type,
+            lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None,
+        )]
+
+    def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
+        query = """query Home_search(
+                        $city: String,
+                        $county: [String],
+                        $state_code: String,
+                        $postal_code: String
+                        $offset: Int,
+                    ) {
+                        home_search(
+                            query: {
+                                city: $city
+                                county: $county
+                                postal_code: $postal_code
+                                state_code: $state_code
+                                status: %s
+                            }
+                            limit: 200
+                            offset: $offset
+                        ) {
+                            count
+                            total
+                            results {
+                                property_id
+                                description {
+                                    baths
+                                    beds
+                                    lot_sqft
+                                    sqft
+                                    text
+                                    sold_price
+                                    stories
+                                    year_built
+                                    garage
+                                    unit_number
+                                    floor_number
+                                }
+                                location {
+                                    address {
+                                        city
+                                        country
+                                        line
+                                        postal_code
+                                        state_code
+                                        state
+                                        street_direction
+                                        street_name
+                                        street_number
+                                        street_post_direction
+                                        street_suffix
+                                        unit
+                                    }
+                                }
+                                list_price
+                                price_per_sqft
+                                source {
+                                    id
+                                }
+                            }
+                        }
+                    }""" % self.listing_type.value
+
+        payload = {
+            'query': query,
+            'variables': variables,
+        }
+
+        response = self.session.post(self.search_url, json=payload)
+        response_json = response.json()
+
+        if return_total:
+            return response_json['data']['home_search']['total']
+
+        properties: list[Property] = []
+
+        for result in response_json['data']['home_search']['results']:
+            realty_property = Property(
+                address=Address(
+                    address_one=result['location']['address']['line'],
+                    city=result['location']['address']['city'],
+                    state=result['location']['address']['state_code'],
+                    zip_code=result['location']['address']['postal_code'],
+                    address_two=result['location']['address']['unit'],
+                ),
+                site_name=self.site_name,
+                url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
+                beds=result['description']['beds'],
+                baths=result['description']['baths'],
+                stories=result['description']['stories'],
+                year_built=result['description']['year_built'],
+                square_feet=result['description']['sqft'],
+                price_per_square_foot=result['price_per_sqft'],
+                price=result['list_price'],
+                mls_id=result['property_id'],
+                listing_type=self.listing_type,
+                lot_size=result['description']['lot_sqft'],
+            )
+
+            properties.append(realty_property)
+
+        return properties
+
     def search(self):
         location_info = self.handle_location()
         location_type = location_info["area_type"]

-        """
-        property types:
-        apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes
-        """
-        print("a")
+        if location_type == 'address':
+            property_id = location_info['mpr_id']
+            return self.handle_address(property_id)
+
+        offset = 0
+
+        search_variables = {
+            'city': location_info.get('city'),
+            'county': location_info.get('county'),
+            'state_code': location_info.get('state_code'),
+            'postal_code': location_info.get('postal_code'),
+            'offset': offset,
+        }
+
+        total = self.handle_area(search_variables, return_total=True)
+
+        homes = []
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [
+                executor.submit(
+                    self.handle_area, variables=search_variables | {'offset': i}, return_total=False
+                ) for i in range(0, total, 200)
+            ]
+
+            for future in as_completed(futures):
+                homes.extend(future.result())
+
+        return homes
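
The new search() paginates in two steps: one handle_area call with return_total=True to learn how many results exist, then one request per 200-result page submitted to a thread pool, with each page's offset merged into the shared variables via dict union. A standalone sketch of the same fan-out pattern (the 463-row total and the fetch_page stand-in are made up for illustration):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    TOTAL = 463  # pretend handle_area(..., return_total=True) reported this many results

    def fetch_page(offset: int) -> list[int]:
        # stand-in for handle_area(search_variables | {'offset': offset})
        return list(range(offset, min(offset + 200, TOTAL)))

    rows = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_page, offset) for offset in range(0, TOTAL, 200)]  # offsets 0, 200, 400
        for future in as_completed(futures):
            rows.extend(future.result())

    assert len(rows) == TOTAL  # pages arrive out of order, but nothing is lost or duplicated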

View File

@@ -3,6 +3,9 @@ from homeharvest import scrape_property
 def test_realtor():
     results = [
+        scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
+        scrape_property(location="Phoenix, AZ", site_name="realtor.com"),  #: does not support "city, state, USA" format
+        scrape_property(location="Dallas, TX", site_name="realtor.com"),  #: does not support "city, state, USA" format
         scrape_property(location="85281", site_name="realtor.com"),
     ]
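
Since scrape_property now returns a pandas DataFrame rather than a list of models, a natural follow-up check on the results list above (illustrative only, not part of this commit) would be:

    import pandas as pd

    assert all(isinstance(result, pd.DataFrame) for result in results)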