Compare commits

..

13 Commits

Author SHA1 Message Date
Zachary Hampton
ec036bb989 - optimizations & updated realtor headers 2024-05-20 12:13:30 -07:00
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
Zachary Hampton
3458a08383 - broker data 2024-05-11 21:35:29 -07:00
Zachary Hampton
c3e24a4ce0 - extra_property_details parameter
- updated docs
- classified exception
2024-05-02 09:04:49 -07:00
Zachary Hampton
46985dcee4 - various data quality fixes (including #70) 2024-05-02 08:48:53 -07:00
Cullen Watson
04ae968716 enh: assessed/estimated value (#77) 2024-04-30 15:29:54 -05:00
Cullen
c5b15e9be5 chore: version 2024-04-20 17:45:29 -05:00
joecryptotoo
7a525caeb8 added county, fips, and text desciption fields (#72) 2024-04-20 17:44:28 -05:00
Cullen Watson
7246703999 Schools (#69) 2024-04-16 20:01:20 -05:00
9 changed files with 296 additions and 72 deletions

View File

@@ -21,7 +21,7 @@
```bash ```bash
pip install -U homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage ## Usage
@@ -43,7 +43,6 @@ properties = scrape_property(
# date_from="2023-05-01", # alternative to past_days # date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28", # date_to="2023-05-28",
# foreclosure=True # foreclosure=True
# mls_only=True, # only fetch MLS listings # mls_only=True, # only fetch MLS listings
) )
print(f"Number of properties: {len(properties)}") print(f"Number of properties: {len(properties)}")
@@ -92,6 +91,8 @@ Optional
├── foreclosure (True/False): If set, fetches only foreclosures ├── foreclosure (True/False): If set, fetches only foreclosures
└── proxy (string): In format 'http://user:pass@host:port' └── proxy (string): In format 'http://user:pass@host:port'
└── extra_property_data (bool): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
``` ```
### Property Schema ### Property Schema
@@ -134,15 +135,18 @@ Property
├── Location Details: ├── Location Details:
│ ├── latitude │ ├── latitude
│ ├── longitude │ ├── longitude
│ ├── nearby_schools
├── Agent Info: ├── Agent Info:
│ ├── agent │ ├── agent
│ ├── broker │ ├── agent_email
│ └── broker_phone │ └── agent_phone
``` ```
### Exceptions ### Exceptions
The following exceptions may be raised when using HomeHarvest: The following exceptions may be raised when using HomeHarvest:
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD.
- `AuthenticationError` - Realtor.com token request failed.

View File

@@ -13,9 +13,10 @@ def scrape_property(
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int = None,
proxy: str = None, proxy: str = None,
date_from: str = None, date_from: str = None, #: TODO: Switch to one parameter, Date, with date_from and date_to, pydantic validation
date_to: str = None, date_to: str = None,
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@@ -23,9 +24,11 @@ def scrape_property(
:param listing_type: Listing Type (for_sale, for_rent, sold) :param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param proxy: Proxy to use for scraping
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28 :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping :param foreclosure: If set, fetches only foreclosure listings.
:param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to) validate_dates(date_from, date_to)
@@ -40,15 +43,17 @@ def scrape_property(
date_from=date_from, date_from=date_from,
date_to=date_to, date_to=date_to,
foreclosure=foreclosure, foreclosure=foreclosure,
extra_property_data=extra_property_data,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
properties_dfs = [process_result(result) for result in results] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,7 +1,12 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
import json
@dataclass @dataclass
@@ -9,33 +14,40 @@ class ScraperInput:
location: str location: str
listing_type: ListingType listing_type: ListingType
radius: float | None = None radius: float | None = None
mls_only: bool | None = None mls_only: bool | None = False
proxy: str | None = None proxy: str | None = None
last_x_days: int | None = None last_x_days: int | None = None
date_from: str | None = None date_from: str | None = None
date_to: str | None = None date_to: str | None = None
foreclosure: bool | None = None foreclosure: bool | None = False
extra_property_data: bool | None = True
class Scraper: class Scraper:
session = None
def __init__( def __init__(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
session: requests.Session = None,
): ):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.listing_type
if not session: if not self.session:
self.session = requests.Session() Scraper.session = requests.Session()
self.session.headers.update( retries = Retry(
total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)
adapter = HTTPAdapter(max_retries=retries)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{ {
"auth": f"Bearer {self.get_access_token()}", "auth": f"Bearer {self.get_access_token()}",
"apollographql-client-name": "com.move.Realtor-apollo-ios", "apollographql-client-name": "com.move.Realtor-apollo-ios",
} }
) )
else:
self.session = session
if scraper_input.proxy: if scraper_input.proxy:
proxy_url = scraper_input.proxy proxy_url = scraper_input.proxy
@@ -49,6 +61,7 @@ class Scraper:
self.date_from = scraper_input.date_from self.date_from = scraper_input.date_from
self.date_to = scraper_input.date_to self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure self.foreclosure = scraper_input.foreclosure
self.extra_property_data = scraper_input.extra_property_data
def search(self) -> list[Property]: ... def search(self) -> list[Property]: ...
@@ -57,19 +70,34 @@ class Scraper:
def handle_location(self): ... def handle_location(self): ...
def get_access_token(self): @staticmethod
url = "https://graph.realtor.com/auth/token" def get_access_token():
device_id = str(uuid.uuid4()).upper()
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}' response = requests.post(
headers = { "https://graph.realtor.com/auth/token",
"Host": "graph.realtor.com", headers={
"x-client-version": "24.20.4.149916", 'Host': 'graph.realtor.com',
"accept": "*/*", 'Accept': '*/*',
"content-type": "Application/json", 'Content-Type': 'Application/json',
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0", 'X-Client-ID': 'rdc_mobile_native,iphone',
"accept-language": "en-US,en;q=0.9", 'X-Visitor-ID': device_id,
} 'X-Client-Version': '24.21.23.679885',
response = requests.post(url, headers=headers, data=payload) 'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
},
data=json.dumps({
"grant_type": "device_mobile",
"device_id": device_id,
"client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
}))
data = response.json() data = response.json()
return data["access_token"]
if not (access_token := data.get("access_token")):
raise AuthenticationError(
"Failed to get access token, use a proxy/vpn or wait a moment and try again.",
response=response
)
return access_token

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
@@ -23,6 +24,12 @@ class ListingType(Enum):
SOLD = "SOLD" SOLD = "SOLD"
@dataclass
class Agent:
name: str | None = None
phone: str | None = None
class PropertyType(Enum): class PropertyType(Enum):
APARTMENT = "APARTMENT" APARTMENT = "APARTMENT"
BUILDING = "BUILDING" BUILDING = "BUILDING"
@@ -30,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME" CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP" CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO" CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS" CONDOS = "CONDOS"
COOP = "COOP" COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX" DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -46,6 +54,7 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
full_line: str | None = None
street: str | None = None street: str | None = None
unit: str | None = None unit: str | None = None
city: str | None = None city: str | None = None
@@ -67,12 +76,30 @@ class Description:
year_built: int | None = None year_built: int | None = None
garage: float | None = None garage: float | None = None
stories: int | None = None stories: int | None = None
text: str | None = None
@dataclass
class AgentPhone: #: For documentation purposes only (at the moment)
number: str | None = None
type: str | None = None
primary: bool | None = None
ext: str | None = None
@dataclass @dataclass
class Agent: class Agent:
name: str | None = None
phones: list[dict] | AgentPhone | None = None
email: str | None = None
href: str | None = None
@dataclass
class Broker:
name: str | None = None name: str | None = None
phone: str | None = None phone: str | None = None
website: str | None = None
@dataclass @dataclass
@@ -95,5 +122,10 @@ class Property:
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
neighborhoods: Optional[str] = None neighborhoods: Optional[str] = None
county: Optional[str] = None
agents: list[Agent] = None fips_code: Optional[str] = None
agents: list[Agent] | None = None
brokers: list[Broker] | None = None
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None

View File

@@ -4,13 +4,13 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -142,7 +143,7 @@ class RealtorScraper(Scraper):
days_on_mls = None days_on_mls = None
property_id = property_info["details"]["permalink"] property_id = property_info["details"]["permalink"]
agents = self.get_agents(property_id) prop_details = self.get_prop_details(property_id)
listing = Property( listing = Property(
mls=mls, mls=mls,
mls_id=( mls_id=(
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None, longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])), alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -176,9 +177,14 @@ class RealtorScraper(Scraper):
year_built=property_info["details"].get("year_built"), year_built=property_info["details"].get("year_built"),
garage=property_info["details"].get("garage"), garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"), stories=property_info["details"].get("stories"),
text=property_info.get("description", {}).get("text"),
), ),
days_on_mls=days_on_mls, days_on_mls=days_on_mls,
agents=agents, agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
return [listing] return [listing]
@@ -231,6 +237,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -272,7 +279,7 @@ class RealtorScraper(Scraper):
}""" }"""
variables = {"property_id": property_id} variables = {"property_id": property_id}
agents = self.get_agents(property_id) prop_details = self.get_prop_details(property_id)
payload = { payload = {
"query": query, "query": query,
@@ -290,7 +297,11 @@ class RealtorScraper(Scraper):
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
address=self._parse_address(property_info, search_type="handle_address"), address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info), description=self._parse_description(property_info),
agents=agents, agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
] ]
@@ -328,6 +339,7 @@ class RealtorScraper(Scraper):
type type
name name
stories stories
text
} }
source { source {
id id
@@ -342,6 +354,7 @@ class RealtorScraper(Scraper):
street_number street_number
street_name street_name
street_suffix street_suffix
line
unit unit
city city
state_code state_code
@@ -351,10 +364,17 @@ class RealtorScraper(Scraper):
lat lat
} }
} }
county {
name
fips_code
}
neighborhoods { neighborhoods {
name name
} }
} }
tax_record {
public_record_id
}
primary_photo { primary_photo {
href href
} }
@@ -475,7 +495,6 @@ class RealtorScraper(Scraper):
} }
response = self.session.post(self.SEARCH_GQL_URL, json=payload) response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response.raise_for_status()
response_json = response.json() response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search" search_key = "home_search" if "home_search" in query else "property_search"
@@ -510,7 +529,7 @@ class RealtorScraper(Scraper):
return return
property_id = result["property_id"] property_id = result["property_id"]
agents = self.get_agents(property_id) prop_details = self.get_prop_details(property_id)
realty_property = Property( realty_property = Property(
mls=mls, mls=mls,
@@ -534,8 +553,15 @@ class RealtorScraper(Scraper):
longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result), description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
agents=agents, agents=prop_details.get("agents"),
brokers=prop_details.get("brokers"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
) )
return realty_property return realty_property
@@ -630,18 +656,91 @@ class RealtorScraper(Scraper):
return homes return homes
def get_agents(self, property_id: str) -> list[Agent]: def get_prop_details(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n }}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}' if not self.extra_property_data:
response = self.session.post(self.PROPERTY_GQL, data=payload) return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
advertisers {
__typename
type
name
email
phones { number type ext primary }
}
consumer_advertisers {
name
phone
href
type
}
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates {
__typename
currentValues: current_values {
__typename
source { __typename type name }
estimate
estimateHigh: estimate_high
estimateLow: estimate_low
date
isBestHomeValue: isbest_homevalue
}
}
}
}"""
variables = {"property_id": property_id}
response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables})
data = response.json() data = response.json()
try:
ads = data["data"]["home"]["consumerAdvertisers"]
except (KeyError, TypeError):
return []
agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads] def get_key(keys: list):
return agents try:
value = data
for key in keys:
value = value[key]
return value or {}
except (KeyError, TypeError, IndexError):
return {}
agents = get_key(["data", "home", "advertisers"])
advertisers = get_key(["data", "home", "consumer_advertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
agents = [Agent(
name=ad["name"],
email=ad["email"],
phones=ad["phones"]
) for ad in agents]
brokers = [Broker(
name=ad["name"],
phone=ad["phone"],
website=ad["href"]
) for ad in advertisers if ad.get("type") != "Agent"]
schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
return {
"agents": agents if agents else None,
"brokers": brokers if brokers else None,
"schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None,
}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:
@@ -663,20 +762,22 @@ class RealtorScraper(Scraper):
return address_part return address_part
def _parse_address(self, result: dict, search_type): @staticmethod
def _parse_address(result: dict, search_type):
if search_type == "general_search": if search_type == "general_search":
address = result["location"]["address"] address = result["location"]["address"]
else: else:
address = result["address"] address = result["address"]
return Address( return Address(
full_line=address.get("line"),
street=" ".join( street=" ".join(
[ part for part in [
self.handle_none_safely(address.get("street_number")), address.get("street_number"),
self.handle_none_safely(address.get("street_direction")), address.get("street_direction"),
self.handle_none_safely(address.get("street_name")), address.get("street_name"),
self.handle_none_safely(address.get("street_suffix")), address.get("street_suffix"),
] ] if part is not None
).strip(), ).strip(),
unit=address["unit"], unit=address["unit"],
city=address["city"], city=address["city"],
@@ -704,17 +805,18 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType(style) if style else None, style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),
sqft=description_data.get("sqft"), sqft=description_data.get("sqft"),
lot_sqft=description_data.get("lot_sqft"), lot_sqft=description_data.get("lot_sqft"),
sold_price=description_data.get("sold_price"), sold_price=description_data.get("sold_price") if result.get('last_sold_date') or result["list_price"] != description_data.get("sold_price") else None, #: has a sold date or list and sold price are different
year_built=description_data.get("year_built"), year_built=description_data.get("year_built"),
garage=description_data.get("garage"), garage=description_data.get("garage"),
stories=description_data.get("stories"), stories=description_data.get("stories"),
text=description_data.get("text"),
) )
@staticmethod @staticmethod

View File

@@ -4,3 +4,11 @@ class InvalidListingType(Exception):
class InvalidDate(Exception): class InvalidDate(Exception):
"""Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23""" """Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23"""
class AuthenticationError(Exception):
"""Raised when there is an issue with the authentication process."""
def __init__(self, *args, response):
super().__init__(*args)
self.response = response

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType from .core.scrapers.models import Property, ListingType, Agent
from .exceptions import InvalidListingType, InvalidDate from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
@@ -8,7 +9,9 @@ ordered_properties = [
"mls", "mls",
"mls_id", "mls_id",
"status", "status",
"text",
"style", "style",
"full_street_line",
"street", "street",
"unit", "unit",
"city", "city",
@@ -24,16 +27,25 @@ ordered_properties = [
"list_date", "list_date",
"sold_price", "sold_price",
"last_sold_date", "last_sold_date",
"assessed_value",
"estimated_value",
"lot_sqft", "lot_sqft",
"price_per_sqft", "price_per_sqft",
"latitude", "latitude",
"longitude", "longitude",
"neighborhoods",
"county",
"fips_code",
"stories", "stories",
"hoa_fee", "hoa_fee",
"parking_garage", "parking_garage",
"agent", "agent",
"agent_email",
"agent_phones",
"broker", "broker",
"broker_phone", "broker_phone",
"broker_website",
"nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
] ]
@@ -45,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city prop_data["city"] = address_data.city
@@ -52,19 +65,27 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["zip_code"] = address_data.zip prop_data["zip_code"] = address_data.zip
if "agents" in prop_data: if "agents" in prop_data:
agents = prop_data["agents"] agents: list[Agent] | None = prop_data["agents"]
if agents: if agents:
prop_data["agent"] = agents[0].name prop_data["agent"] = agents[0].name
if len(agents) > 1: prop_data["agent_email"] = agents[0].email
prop_data["broker"] = agents[1].name prop_data["agent_phones"] = agents[0].phones
prop_data["broker_phone"] = agents[1].phone
if "brokers" in prop_data:
brokers = prop_data["brokers"]
if brokers:
prop_data["broker"] = brokers[0].name
prop_data["broker_phone"] = brokers[0].phone
prop_data["broker_website"] = brokers[0].website
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style.value prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half
@@ -74,6 +95,7 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["year_built"] = description.year_built prop_data["year_built"] = description.year_built
prop_data["parking_garage"] = description.garage prop_data["parking_garage"] = description.garage
prop_data["stories"] = description.stories prop_data["stories"] = description.stories
prop_data["text"] = description.text
properties_df = pd.DataFrame([prop_data]) properties_df = pd.DataFrame([prop_data])
properties_df = properties_df.reindex(columns=ordered_properties) properties_df = properties_df.reindex(columns=ordered_properties)
@@ -97,5 +119,5 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
if date_to_obj < date_from_obj: if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.") raise InvalidDate("date_to must be after date_from.")
except ValueError as e: except ValueError:
raise InvalidDate(f"Invalid date format or range") raise InvalidDate(f"Invalid date format or range")

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.17" version = "0.3.28"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main" homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.10,<3.13" python = ">=3.9,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"

View File

@@ -142,3 +142,26 @@ def test_realtor_foreclosed():
def test_realtor_agent(): def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale") scraped = scrape_property(location="Detroit, MI", listing_type="for_sale")
assert scraped["agent"].nunique() > 1 assert scraped["agent"].nunique() > 1
def test_realtor_without_extra_details():
results = [
scrape_property(
location="15509 N 172nd Dr, Surprise, AZ 85388",
extra_property_data=False,
),
scrape_property(
location="15509 N 172nd Dr, Surprise, AZ 85388",
),
]
assert not results[0].equals(results[1])
def test_pr_zip_code():
results = scrape_property(
location="00741",
listing_type="for_sale",
)
assert results is not None and len(results) > 0