- finished realtor
parent
905cfcae2c
commit
ba9fe806a7
|
@ -69,9 +69,9 @@ def process_result(result: Union[Building, Property]) -> pd.DataFrame:
|
||||||
prop_data = result.__dict__
|
prop_data = result.__dict__
|
||||||
|
|
||||||
address_data = prop_data["address"]
|
address_data = prop_data["address"]
|
||||||
prop_data["site_name"] = prop_data["site_name"].value
|
prop_data["site_name"] = prop_data["site_name"]
|
||||||
prop_data["listing_type"] = prop_data["listing_type"].value
|
prop_data["listing_type"] = prop_data["listing_type"].value
|
||||||
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data["property_type"] else None
|
||||||
prop_data["address_one"] = address_data.address_one
|
prop_data["address_one"] = address_data.address_one
|
||||||
prop_data["city"] = address_data.city
|
prop_data["city"] = address_data.city
|
||||||
prop_data["state"] = address_data.state
|
prop_data["state"] = address_data.state
|
||||||
|
@ -90,13 +90,13 @@ def scrape_property(
|
||||||
location: str,
|
location: str,
|
||||||
site_name: str,
|
site_name: str,
|
||||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||||
) -> Union[list[Building], list[Property]]:
|
) -> pd.DataFrame:
|
||||||
validate_input(site_name, listing_type)
|
validate_input(site_name, listing_type)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
location=location,
|
location=location,
|
||||||
listing_type=ListingType[listing_type.upper()],
|
listing_type=ListingType[listing_type.upper()],
|
||||||
site_name=SiteName[site_name.upper()],
|
site_name=site_name.lower(),
|
||||||
)
|
)
|
||||||
|
|
||||||
site = _scrapers[site_name.lower()](scraper_input)
|
site = _scrapers[site_name.lower()](scraper_input)
|
||||||
|
|
|
@ -7,13 +7,15 @@ from .models import Property, ListingType, SiteName
|
||||||
class ScraperInput:
|
class ScraperInput:
|
||||||
location: str
|
location: str
|
||||||
listing_type: ListingType
|
listing_type: ListingType
|
||||||
site_name: SiteName
|
site_name: str
|
||||||
proxy_url: str | None = None
|
proxy_url: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, scraper_input: ScraperInput):
|
def __init__(self, scraper_input: ScraperInput):
|
||||||
self.location = scraper_input.location
|
self.location = scraper_input.location
|
||||||
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|
|
@ -53,7 +53,7 @@ class Address:
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class Realty:
|
class Realty:
|
||||||
site_name: SiteName
|
site_name: str
|
||||||
address: Address
|
address: Address
|
||||||
url: str
|
url: str
|
||||||
listing_type: ListingType | None = None
|
listing_type: ListingType | None = None
|
||||||
|
@ -68,7 +68,6 @@ class Property(Realty):
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
square_feet: int | None = None
|
square_feet: int | None = None
|
||||||
price_per_square_foot: int | None = None
|
price_per_square_foot: int | None = None
|
||||||
year_built: int | None = None
|
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent_name: str | None = None
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
import json
|
import json
|
||||||
from ..models import Property, Address
|
from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any
|
from typing import Any, Generator
|
||||||
|
from ....exceptions import NoResultsFound
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
class RealtorScraper(Scraper):
|
class RealtorScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
|
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
||||||
|
|
||||||
def handle_location(self):
|
def handle_location(self):
|
||||||
headers = {
|
headers = {
|
||||||
|
@ -26,7 +29,7 @@ class RealtorScraper(Scraper):
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"input": self.location,
|
"input": self.location,
|
||||||
"client_id": "for-sale",
|
"client_id": self.listing_type.value.replace('_', '-'),
|
||||||
"limit": "1",
|
"limit": "1",
|
||||||
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
||||||
}
|
}
|
||||||
|
@ -38,14 +41,228 @@ class RealtorScraper(Scraper):
|
||||||
)
|
)
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
return response_json["autocomplete"][0]
|
result = response_json["autocomplete"]
|
||||||
|
|
||||||
|
if result is None:
|
||||||
|
raise NoResultsFound("No results found for location: " + self.location)
|
||||||
|
|
||||||
|
return result[0]
|
||||||
|
|
||||||
|
def handle_address(self, property_id: str) -> list[Property]:
    """Fetch one property by id and return it as a single-element list.

    Posts a GraphQL ``Property`` query to ``self.search_url`` through
    ``self.session`` and maps the ``data.property`` payload onto a
    ``Property``.

    :param property_id: value sent as the ``$property_id`` query variable;
        it is also stored on the result as ``mls_id``.
    :return: a list holding exactly one ``Property``.
    :raises NoResultsFound: if the response carries no ``data.property``.
    """
    query = """query Property($property_id: ID!) {
                property(id: $property_id) {
                    property_id
                    details {
                        date_updated
                        garage
                        permalink
                        year_built
                        stories
                    }
                    address {
                        address_validation_code
                        city
                        country
                        county
                        line
                        postal_code
                        state_code
                        street_direction
                        street_name
                        street_number
                        street_suffix
                        street_post_direction
                        unit_value
                        unit
                        unit_descriptor
                        zip
                    }
                    basic {
                        baths
                        beds
                        price
                        sqft
                        lot_sqft
                        type
                        sold_price
                    }
                    public_record {
                        lot_size
                        sqft
                        stories
                        units
                        year_built
                    }
                }
            }"""

    payload = {
        'query': query,
        'variables': {'property_id': property_id},
    }

    response = self.session.post(self.search_url, json=payload)
    response_json = response.json()

    property_info = (response_json.get('data') or {}).get('property')
    if property_info is None:
        # Same exception handle_location raises for an empty lookup, so
        # callers see one error type instead of a TypeError on None.
        raise NoResultsFound(f"No results found for property id: {property_id}")

    # Any section may come back as null; normalise each to an empty dict so
    # a missing section yields None fields rather than a crash.
    address = property_info.get('address') or {}
    details = property_info.get('details') or {}
    basic = property_info.get('basic') or {}
    public_record = property_info.get('public_record') or {}

    price = basic.get('price')
    sqft = basic.get('sqft')
    # The truthiness test on sqft rejects both None and 0 (division by zero).
    # Rounded because the Property field is declared as ``int | None``.
    price_per_square_foot = (
        round(price / sqft) if price is not None and sqft else None
    )

    # Fall back to the property id when no permalink is returned; handle_area
    # builds its detail URLs from the property id as well.
    url_slug = details.get('permalink') or property_id

    return [
        Property(
            site_name=self.site_name,
            address=Address(
                address_one=address.get('line'),
                city=address.get('city'),
                state=address.get('state_code'),
                zip_code=address.get('postal_code'),
            ),
            url=f"https://www.realtor.com/realestateandhomes-detail/{url_slug}",
            beds=basic.get('beds'),
            baths=basic.get('baths'),
            stories=details.get('stories'),
            year_built=details.get('year_built'),
            square_feet=sqft,
            price_per_square_foot=price_per_square_foot,
            price=price,
            mls_id=property_id,
            listing_type=self.listing_type,
            lot_size=public_record.get('lot_size'),
        )
    ]
|
||||||
|
|
||||||
|
def handle_area(self, variables: dict, return_total: bool = False) -> "list[Property] | int":
    """Run one page of the ``Home_search`` GraphQL query.

    The query is posted to ``self.search_url`` through ``self.session``;
    ``self.listing_type.value`` is substituted into the ``status`` filter.

    :param variables: GraphQL variables for the query (``city``, ``county``,
        ``state_code``, ``postal_code``, ``offset``).
    :param return_total: when true, return only the ``total`` reported by
        the response instead of the listings.
    :return: ``home_search.total`` (0 when absent) if ``return_total`` is
        set, otherwise a list with one ``Property`` per entry in
        ``home_search.results`` (empty when there are none).
    """
    query = """query Home_search(
                    $city: String,
                    $county: [String],
                    $state_code: String,
                    $postal_code: String
                    $offset: Int,
                ) {
                    home_search(
                        query: {
                            city: $city
                            county: $county
                            postal_code: $postal_code
                            state_code: $state_code
                            status: %s
                        }
                        limit: 200
                        offset: $offset
                    ) {
                        count
                        total
                        results {
                            property_id
                            description {
                                baths
                                beds
                                lot_sqft
                                sqft
                                text
                                sold_price
                                stories
                                year_built
                                garage
                                unit_number
                                floor_number
                            }
                            location {
                                address {
                                    city
                                    country
                                    line
                                    postal_code
                                    state_code
                                    state
                                    street_direction
                                    street_name
                                    street_number
                                    street_post_direction
                                    street_suffix
                                    unit
                                }
                            }
                            list_price
                            price_per_sqft
                            source {
                                id
                            }
                        }
                    }
                }""" % self.listing_type.value

    payload = {
        'query': query,
        'variables': variables,
    }

    response = self.session.post(self.search_url, json=payload)
    response_json = response.json()

    # ``data`` or ``home_search`` can be null; treat that as "no results"
    # instead of failing with a TypeError on subscripting None.
    home_search = (response_json.get('data') or {}).get('home_search') or {}

    if return_total:
        return home_search.get('total') or 0

    properties: list[Property] = []

    for result in home_search.get('results') or []:
        # Nested objects may be null per listing; fall back to empty dicts
        # so such a listing is kept with None fields.
        address = (result.get('location') or {}).get('address') or {}
        description = result.get('description') or {}

        properties.append(
            Property(
                address=Address(
                    address_one=address.get('line'),
                    city=address.get('city'),
                    state=address.get('state_code'),
                    zip_code=address.get('postal_code'),
                    address_two=address.get('unit'),
                ),
                site_name=self.site_name,
                url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
                beds=description.get('beds'),
                baths=description.get('baths'),
                stories=description.get('stories'),
                year_built=description.get('year_built'),
                square_feet=description.get('sqft'),
                price_per_square_foot=result.get('price_per_sqft'),
                price=result.get('list_price'),
                mls_id=result['property_id'],
                listing_type=self.listing_type,
                lot_size=description.get('lot_sqft'),
            )
        )

    return properties
|
||||||
|
|
||||||
def search(self):
    """Resolve ``self.location`` and return the matching listings.

    The location is first resolved with ``handle_location``. If it resolves
    to a single address, that property is fetched with ``handle_address``.
    Otherwise the area is queried page by page with ``handle_area``, the
    pages being fetched concurrently.

    :return: the list produced by ``handle_address`` for an address lookup,
        otherwise the concatenation of every ``handle_area`` page in
        ascending offset order (empty when the area reports no results).
    """
    location_info = self.handle_location()

    if location_info["area_type"] == 'address':
        return self.handle_address(location_info['mpr_id'])

    # Must stay equal to the ``limit`` hard-coded in handle_area's query.
    page_size = 200

    search_variables = {
        'city': location_info.get('city'),
        'county': location_info.get('county'),
        'state_code': location_info.get('state_code'),
        'postal_code': location_info.get('postal_code'),
        'offset': 0,
    }

    total = self.handle_area(search_variables, return_total=True)
    if not total:
        # Nothing to page through (also covers a null total, which would
        # make range() below raise).
        return []

    homes = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(
                self.handle_area,
                variables=search_variables | {'offset': page_offset},
                return_total=False,
            )
            for page_offset in range(0, total, page_size)
        ]

        # Collect in submission order rather than completion order so the
        # returned list is deterministic; the requests still run in parallel.
        for future in futures:
            homes.extend(future.result())

    return homes
|
||||||
|
|
|
@ -3,6 +3,9 @@ from homeharvest import scrape_property
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
results = [
|
results = [
|
||||||
|
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
|
||||||
|
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format
|
||||||
|
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="85281", site_name="realtor.com"),
|
scrape_property(location="85281", site_name="realtor.com"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue