refactor(realtor): fit to updated models
parent
ffd3ce6aed
commit
869d7e7c51
|
@ -70,7 +70,10 @@ def process_result(result: Property) -> pd.DataFrame:
|
||||||
|
|
||||||
prop_data["site_name"] = prop_data["site_name"].value
|
prop_data["site_name"] = prop_data["site_name"].value
|
||||||
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
|
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
|
||||||
|
if "property_type" in prop_data and prop_data["property_type"] is not None:
|
||||||
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
||||||
|
else:
|
||||||
|
prop_data["property_type"] = None
|
||||||
if "address" in prop_data:
|
if "address" in prop_data:
|
||||||
address_data = prop_data["address"]
|
address_data = prop_data["address"]
|
||||||
prop_data["street_address"] = address_data.street_address
|
prop_data["street_address"] = address_data.street_address
|
||||||
|
@ -108,7 +111,7 @@ def scrape_property(
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
location=location,
|
location=location,
|
||||||
listing_type=ListingType[listing_type.upper()],
|
listing_type=ListingType[listing_type.upper()],
|
||||||
site_name=SiteName[site_name.upper()],
|
site_name=SiteName.get_by_value(site_name.lower()),
|
||||||
)
|
)
|
||||||
|
|
||||||
site = _scrapers[site_name.lower()](scraper_input)
|
site = _scrapers[site_name.lower()](scraper_input)
|
||||||
|
|
|
@ -7,6 +7,13 @@ class SiteName(Enum):
|
||||||
REDFIN = "redfin"
|
REDFIN = "redfin"
|
||||||
REALTOR = "realtor.com"
|
REALTOR = "realtor.com"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_by_value(cls, value):
|
||||||
|
for item in cls:
|
||||||
|
if item.value == value:
|
||||||
|
return item
|
||||||
|
raise ValueError(f"{value} not found in {cls}")
|
||||||
|
|
||||||
|
|
||||||
class ListingType(Enum):
|
class ListingType(Enum):
|
||||||
FOR_SALE = "FOR_SALE"
|
FOR_SALE = "FOR_SALE"
|
||||||
|
@ -57,14 +64,13 @@ class Address:
|
||||||
country: str | None = None
|
country: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property:
|
class Property:
|
||||||
property_url: str
|
property_url: str
|
||||||
site_name: SiteName
|
site_name: SiteName
|
||||||
listing_type: ListingType
|
listing_type: ListingType
|
||||||
property_type: PropertyType
|
|
||||||
address: Address
|
address: Address
|
||||||
|
property_type: PropertyType | None = None
|
||||||
|
|
||||||
# house for sale
|
# house for sale
|
||||||
price: int | None = None
|
price: int | None = None
|
||||||
|
@ -78,7 +84,6 @@ class Property:
|
||||||
stories: int | None = None
|
stories: int | None = None
|
||||||
year_built: int | None = None
|
year_built: int | None = None
|
||||||
price_per_sqft: int | None = None
|
price_per_sqft: int | None = None
|
||||||
year_built: int | None = None
|
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent_name: str | None = None
|
||||||
|
|
|
@ -3,6 +3,7 @@ from ..models import Property, Address
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from typing import Any, Generator
|
from typing import Any, Generator
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound
|
||||||
|
from ....utils import parse_address_two
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,7 +30,7 @@ class RealtorScraper(Scraper):
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"input": self.location,
|
"input": self.location,
|
||||||
"client_id": self.listing_type.value.replace('_', '-'),
|
"client_id": self.listing_type.value.lower().replace("_", "-"),
|
||||||
"limit": "1",
|
"limit": "1",
|
||||||
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
||||||
}
|
}
|
||||||
|
@ -96,46 +97,57 @@ class RealtorScraper(Scraper):
|
||||||
}
|
}
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
variables = {
|
variables = {"property_id": property_id}
|
||||||
'property_id': property_id
|
|
||||||
}
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'query': query,
|
"query": query,
|
||||||
'variables': variables,
|
"variables": variables,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.session.post(self.search_url, json=payload)
|
response = self.session.post(self.search_url, json=payload)
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
property_info = response_json['data']['property']
|
property_info = response_json["data"]["property"]
|
||||||
|
street_address = property_info["address"]["line"]
|
||||||
|
unit = parse_address_two(street_address)
|
||||||
|
|
||||||
return [Property(
|
return [
|
||||||
|
Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
address=Address(
|
address=Address(
|
||||||
address_one=property_info['address']['line'],
|
street_address=street_address,
|
||||||
city=property_info['address']['city'],
|
city=property_info["address"]["city"],
|
||||||
state=property_info['address']['state_code'],
|
state=property_info["address"]["state_code"],
|
||||||
zip_code=property_info['address']['postal_code'],
|
zip_code=property_info["address"]["postal_code"],
|
||||||
|
unit=unit,
|
||||||
|
country="USA",
|
||||||
),
|
),
|
||||||
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'],
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
beds=property_info['basic']['beds'],
|
+ property_info["details"]["permalink"],
|
||||||
baths=property_info['basic']['baths'],
|
beds=property_info["basic"]["beds"],
|
||||||
stories=property_info['details']['stories'],
|
baths=property_info["basic"]["baths"],
|
||||||
year_built=property_info['details']['year_built'],
|
stories=property_info["details"]["stories"],
|
||||||
square_feet=property_info['basic']['sqft'],
|
year_built=property_info["details"]["year_built"],
|
||||||
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft']
|
square_feet=property_info["basic"]["sqft"],
|
||||||
if property_info['basic']['sqft'] is not None and
|
price_per_sqft=property_info["basic"]["price"]
|
||||||
property_info['basic']['price'] is not None
|
// property_info["basic"]["sqft"]
|
||||||
|
if property_info["basic"]["sqft"] is not None
|
||||||
|
and property_info["basic"]["price"] is not None
|
||||||
else None,
|
else None,
|
||||||
price=property_info['basic']['price'],
|
price=property_info["basic"]["price"],
|
||||||
mls_id=property_id,
|
mls_id=property_id,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None,
|
lot_area_value=property_info["public_record"]["lot_size"]
|
||||||
)]
|
if property_info["public_record"] is not None
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
|
def handle_area(
|
||||||
query = """query Home_search(
|
self, variables: dict, return_total: bool = False
|
||||||
|
) -> list[Property] | int:
|
||||||
|
query = (
|
||||||
|
"""query Home_search(
|
||||||
$city: String,
|
$city: String,
|
||||||
$county: [String],
|
$county: [String],
|
||||||
$state_code: String,
|
$state_code: String,
|
||||||
|
@ -193,42 +205,57 @@ class RealtorScraper(Scraper):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}""" % self.listing_type.value
|
}"""
|
||||||
|
% self.listing_type.value.lower()
|
||||||
|
)
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'query': query,
|
"query": query,
|
||||||
'variables': variables,
|
"variables": variables,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.session.post(self.search_url, json=payload)
|
response = self.session.post(self.search_url, json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|
||||||
if return_total:
|
if return_total:
|
||||||
return response_json['data']['home_search']['total']
|
return response_json["data"]["home_search"]["total"]
|
||||||
|
|
||||||
properties: list[Property] = []
|
properties: list[Property] = []
|
||||||
|
|
||||||
for result in response_json['data']['home_search']['results']:
|
if (
|
||||||
|
response_json is None
|
||||||
|
or "data" not in response_json
|
||||||
|
or response_json["data"] is None
|
||||||
|
or "home_search" not in response_json["data"]
|
||||||
|
or response_json["data"]["home_search"] is None
|
||||||
|
or "results" not in response_json["data"]["home_search"]
|
||||||
|
):
|
||||||
|
return []
|
||||||
|
|
||||||
|
for result in response_json["data"]["home_search"]["results"]:
|
||||||
realty_property = Property(
|
realty_property = Property(
|
||||||
address=Address(
|
address=Address(
|
||||||
address_one=result['location']['address']['line'],
|
street_address=result["location"]["address"]["line"],
|
||||||
city=result['location']['address']['city'],
|
city=result["location"]["address"]["city"],
|
||||||
state=result['location']['address']['state_code'],
|
state=result["location"]["address"]["state_code"],
|
||||||
zip_code=result['location']['address']['postal_code'],
|
zip_code=result["location"]["address"]["postal_code"],
|
||||||
address_two=result['location']['address']['unit'],
|
unit=result["location"]["address"]["unit"],
|
||||||
|
country="USA",
|
||||||
),
|
),
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
beds=result['description']['beds'],
|
+ result["property_id"],
|
||||||
baths=result['description']['baths'],
|
beds=result["description"]["beds"],
|
||||||
stories=result['description']['stories'],
|
baths=result["description"]["baths"],
|
||||||
year_built=result['description']['year_built'],
|
stories=result["description"]["stories"],
|
||||||
square_feet=result['description']['sqft'],
|
year_built=result["description"]["year_built"],
|
||||||
price_per_square_foot=result['price_per_sqft'],
|
square_feet=result["description"]["sqft"],
|
||||||
price=result['list_price'],
|
price_per_sqft=result["price_per_sqft"],
|
||||||
mls_id=result['property_id'],
|
price=result["list_price"],
|
||||||
|
mls_id=result["property_id"],
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
lot_size=result['description']['lot_sqft'],
|
lot_area_value=result["description"]["lot_sqft"],
|
||||||
)
|
)
|
||||||
|
|
||||||
properties.append(realty_property)
|
properties.append(realty_property)
|
||||||
|
@ -239,17 +266,17 @@ class RealtorScraper(Scraper):
|
||||||
location_info = self.handle_location()
|
location_info = self.handle_location()
|
||||||
location_type = location_info["area_type"]
|
location_type = location_info["area_type"]
|
||||||
|
|
||||||
if location_type == 'address':
|
if location_type == "address":
|
||||||
property_id = location_info['mpr_id']
|
property_id = location_info["mpr_id"]
|
||||||
return self.handle_address(property_id)
|
return self.handle_address(property_id)
|
||||||
|
|
||||||
offset = 0
|
offset = 0
|
||||||
search_variables = {
|
search_variables = {
|
||||||
'city': location_info.get('city'),
|
"city": location_info.get("city"),
|
||||||
'county': location_info.get('county'),
|
"county": location_info.get("county"),
|
||||||
'state_code': location_info.get('state_code'),
|
"state_code": location_info.get("state_code"),
|
||||||
'postal_code': location_info.get('postal_code'),
|
"postal_code": location_info.get("postal_code"),
|
||||||
'offset': offset,
|
"offset": offset,
|
||||||
}
|
}
|
||||||
|
|
||||||
total = self.handle_area(search_variables, return_total=True)
|
total = self.handle_area(search_variables, return_total=True)
|
||||||
|
@ -258,8 +285,11 @@ class RealtorScraper(Scraper):
|
||||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
futures = [
|
futures = [
|
||||||
executor.submit(
|
executor.submit(
|
||||||
self.handle_area, variables=search_variables | {'offset': i}, return_total=False
|
self.handle_area,
|
||||||
) for i in range(0, total, 200)
|
variables=search_variables | {"offset": i},
|
||||||
|
return_total=False,
|
||||||
|
)
|
||||||
|
for i in range(0, total, 200)
|
||||||
]
|
]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
|
|
|
@ -100,28 +100,27 @@ class RedfinScraper(Scraper):
|
||||||
address=Address(
|
address=Address(
|
||||||
street_address=" ".join(
|
street_address=" ".join(
|
||||||
[
|
[
|
||||||
building['address']['streetNumber'],
|
building["address"]["streetNumber"],
|
||||||
building['address']['directionalPrefix'],
|
building["address"]["directionalPrefix"],
|
||||||
building['address']['streetName'],
|
building["address"]["streetName"],
|
||||||
building['address']['streetType'],
|
building["address"]["streetType"],
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
city=building['address']['city'],
|
city=building["address"]["city"],
|
||||||
state=building['address']['stateOrProvinceCode'],
|
state=building["address"]["stateOrProvinceCode"],
|
||||||
zip_code=building['address']['postalCode'],
|
zip_code=building["address"]["postalCode"],
|
||||||
unit=" ".join(
|
unit=" ".join(
|
||||||
[
|
[
|
||||||
building['address']['unitType'],
|
building["address"]["unitType"],
|
||||||
building['address']['unitValue'],
|
building["address"]["unitValue"],
|
||||||
]
|
]
|
||||||
)
|
),
|
||||||
),
|
),
|
||||||
property_url="https://www.redfin.com{}".format(building["url"]),
|
property_url="https://www.redfin.com{}".format(building["url"]),
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
bldg_unit_count=building["numUnitsForSale"],
|
bldg_unit_count=building["numUnitsForSale"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def handle_address(self, home_id: str):
|
def handle_address(self, home_id: str):
|
||||||
"""
|
"""
|
||||||
EPs:
|
EPs:
|
||||||
|
@ -160,7 +159,8 @@ class RedfinScraper(Scraper):
|
||||||
homes = [
|
homes = [
|
||||||
self._parse_home(home) for home in response_json["payload"]["homes"]
|
self._parse_home(home) for home in response_json["payload"]["homes"]
|
||||||
] + [
|
] + [
|
||||||
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
|
self._parse_building(building)
|
||||||
|
for building in response_json["payload"]["buildings"].values()
|
||||||
]
|
]
|
||||||
|
|
||||||
return homes
|
return homes
|
||||||
|
|
|
@ -98,8 +98,7 @@ class ZillowScraper(Scraper):
|
||||||
else filter_state_sold
|
else filter_state_sold
|
||||||
)
|
)
|
||||||
|
|
||||||
payload = json.dumps(
|
payload = {
|
||||||
{
|
|
||||||
"searchQueryState": {
|
"searchQueryState": {
|
||||||
"pagination": {},
|
"pagination": {},
|
||||||
"isMapVisible": True,
|
"isMapVisible": True,
|
||||||
|
@ -116,8 +115,7 @@ class ZillowScraper(Scraper):
|
||||||
"wants": {"cat1": ["mapResults"]},
|
"wants": {"cat1": ["mapResults"]},
|
||||||
"isDebugRequest": False,
|
"isDebugRequest": False,
|
||||||
}
|
}
|
||||||
)
|
resp = self.session.put(url, headers=self._get_headers(), json=payload)
|
||||||
resp = self.session.put(url, headers=self._get_headers(), data=payload)
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
a = resp.json()
|
a = resp.json()
|
||||||
return self._parse_properties(resp.json())
|
return self._parse_properties(resp.json())
|
||||||
|
@ -176,9 +174,7 @@ class ZillowScraper(Scraper):
|
||||||
and result["variableData"]["type"] == "TIME_ON_INFO"
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
else None,
|
else None,
|
||||||
"img_src": result.get("imgSrc"),
|
"img_src": result.get("imgSrc"),
|
||||||
"price_per_sqft": int(
|
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
|
||||||
home_info["price"] // home_info["livingArea"]
|
|
||||||
)
|
|
||||||
if "livingArea" in home_info and "price" in home_info
|
if "livingArea" in home_info and "price" in home_info
|
||||||
else None,
|
else None,
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,9 +3,17 @@ from homeharvest import scrape_property
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format
|
location="2530 Al Lipscomb Way",
|
||||||
scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format
|
site_name="realtor.com",
|
||||||
|
listing_type="for_sale",
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
|
||||||
|
), #: does not support "city, state, USA" format
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
|
||||||
|
), #: does not support "city, state, USA" format
|
||||||
scrape_property(location="85281", site_name="realtor.com"),
|
scrape_property(location="85281", site_name="realtor.com"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -3,9 +3,15 @@ from homeharvest import scrape_property
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name="redfin"),
|
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="redfin"),
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
|
||||||
|
),
|
||||||
scrape_property(location="85281", site_name="redfin"),
|
scrape_property(location="85281", site_name="redfin"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -3,9 +3,15 @@ from homeharvest import scrape_property
|
||||||
|
|
||||||
def test_zillow():
|
def test_zillow():
|
||||||
results = [
|
results = [
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"),
|
scrape_property(
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name="zillow"),
|
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="zillow"),
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
|
||||||
|
),
|
||||||
|
scrape_property(
|
||||||
|
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
|
||||||
|
),
|
||||||
scrape_property(location="85281", site_name="zillow"),
|
scrape_property(location="85281", site_name="zillow"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue