enh: property type (#102)

master
Cullen Watson 2024-11-03 17:23:07 -06:00 committed by GitHub
parent 1f717bd9e3
commit 8e04f6b117
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 274 additions and 241 deletions

README.md

@@ -68,13 +68,24 @@ print(properties.head())
 ```
 Required
 ├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.
 ├── listing_type (option): Choose the type of listing.
     - 'for_rent'
     - 'for_sale'
     - 'sold'
-    - 'pending'
+    - 'pending' (for pending/contingent sales)

 Optional
+├── property_type (list): Choose the type of properties.
+    - 'single_family'
+    - 'multi_family'
+    - 'condos'
+    - 'condo_townhome_rowhome_coop'
+    - 'condo_townhome'
+    - 'townhomes'
+    - 'duplex_triplex'
+    - 'farm'
+    - 'land'
+    - 'mobile'
 ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
 │    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
@@ -94,7 +105,7 @@ Optional
 ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data for general searches (e.g. schools, tax appraisals etc.)
-├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
+├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending'
 └── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
 ```
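A minimal usage sketch of the new `property_type` filter (the location and values here are illustrative; any of the values documented above can be combined in one list):

```python
from homeharvest import scrape_property

# Fetch vacant land and single-family homes sold in the last 90 days.
properties = scrape_property(
    location="Dallas, TX",
    listing_type="sold",
    property_type=["land", "single_family"],
    past_days=90,
)
print(properties.head())
```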

Example notebook (deleted)

@@ -1,141 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"from homeharvest import scrape_property\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None) # Show all columns\n",
"pd.set_option('display.max_rows', None) # Show all rows\n",
"pd.set_option('display.width', None) # Auto-adjust display width to fit console\n",
"pd.set_option('display.max_colwidth', 50) # Limit max column width to 50 characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
"metadata": {},
"outputs": [],
"source": [
"# check for sale properties\n",
"scrape_property(\n",
" location=\"dallas\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaf86093",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# search a specific address\n",
"scrape_property(\n",
" location=\"2530 Al Lipscomb Way\",\n",
" listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
"metadata": {},
"outputs": [],
"source": [
"# check rentals\n",
"scrape_property(\n",
" location=\"chicago, illinois\",\n",
" listing_type=\"for_rent\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af280cd3",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# check sold properties\n",
"properties = scrape_property(\n",
" location=\"90210\",\n",
" listing_type=\"sold\",\n",
" past_days=10\n",
")\n",
"display(properties)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "628c1ce2",
"metadata": {
"collapsed": false,
"is_executing": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# display clickable URLs\n",
"from IPython.display import display, HTML\n",
"properties['property_url'] = '<a href=\"' + properties['property_url'] + '\" target=\"_blank\">' + properties['property_url'] + '</a>'\n",
"\n",
"html = properties.to_html(escape=False)\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Example script (deleted)

@@ -1,20 +0,0 @@
from homeharvest import scrape_property
from datetime import datetime

# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"

properties = scrape_property(
    location="San Diego, CA",
    listing_type="sold",  # or (for_sale, for_rent)
    past_days=30,  # sold in last 30 days - listed in last x days if (for_sale, for_rent)
    # pending_or_contingent=True  # use on for_sale listings to find pending / contingent listings
    # mls_only=True,  # only fetch MLS listings
    # proxy="http://user:pass@host:port"  # use a proxy to change your IP address
)
print(f"Number of properties: {len(properties)}")

# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())

examples/price_of_land.py (new file, 104 lines)

@@ -0,0 +1,104 @@
"""
This script scrapes sold and pending sold land listings in past year for a list of zip codes and saves the data to individual Excel files.
It adds two columns to the data: 'lot_acres' and 'ppa' (price per acre) for user to analyze average price of land in a zip code.
"""

import os

import pandas as pd

from homeharvest import scrape_property


def get_property_details(zip: str, listing_type):
    properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
    if not properties.empty:
        properties["lot_acres"] = properties["lot_sqft"].apply(lambda x: x / 43560 if pd.notnull(x) else None)
        properties = properties[properties["sqft"].isnull()]
        properties["ppa"] = properties.apply(
            lambda row: (
                int(
                    (
                        row["sold_price"]
                        if (pd.notnull(row["sold_price"]) and row["status"] == "SOLD")
                        else row["list_price"]
                    )
                    / row["lot_acres"]
                )
                if pd.notnull(row["lot_acres"])
                and row["lot_acres"] > 0
                and (pd.notnull(row["sold_price"]) or pd.notnull(row["list_price"]))
                else None
            ),
            axis=1,
        )
        properties["ppa"] = properties["ppa"].astype("Int64")
        selected_columns = [
            "property_url",
            "property_id",
            "style",
            "status",
            "street",
            "city",
            "state",
            "zip_code",
            "county",
            "list_date",
            "last_sold_date",
            "list_price",
            "sold_price",
            "lot_sqft",
            "lot_acres",
            "ppa",
        ]
        properties = properties[selected_columns]

    return properties


def output_to_excel(zip_code, sold_df, pending_df):
    root_folder = os.getcwd()
    zip_folder = os.path.join(root_folder, "zips", zip_code)

    # Create zip code folder if it doesn't exist
    os.makedirs(zip_folder, exist_ok=True)

    # Define file paths
    sold_file = os.path.join(zip_folder, f"{zip_code}_sold.xlsx")
    pending_file = os.path.join(zip_folder, f"{zip_code}_pending.xlsx")

    # Save individual sold and pending files
    sold_df.to_excel(sold_file, index=False)
    pending_df.to_excel(pending_file, index=False)


zip_codes = map(
    str,
    [
        22920,
        77024,
        78028,
        24553,
        22967,
        22971,
        22922,
        22958,
        22969,
        22949,
        22938,
        24599,
        24562,
        22976,
        24464,
        22964,
        24581,
    ],
)

combined_df = pd.DataFrame()
for zip in zip_codes:
    sold_df = get_property_details(zip, "sold")
    pending_df = get_property_details(zip, "pending")
    combined_df = pd.concat([combined_df, sold_df, pending_df], ignore_index=True)
    output_to_excel(zip, sold_df, pending_df)

combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
combined_df.to_excel(combined_file, index=False)
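A quick sanity check on the unit conversion the script relies on, with hypothetical numbers (not taken from any listing): one acre is 43,560 sqft, so the 'ppa' column works out as follows.

```python
lot_sqft = 217_800          # hypothetical lot: 217,800 sqft
sold_price = 250_000        # hypothetical sale price

lot_acres = lot_sqft / 43_560        # 5.0 acres
ppa = int(sold_price / lot_acres)    # 50000 dollars per acre
```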

homeharvest/__init__.py

@@ -3,12 +3,13 @@ import pandas as pd
 from .core.scrapers import ScraperInput
 from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
 from .core.scrapers.realtor import RealtorScraper
-from .core.scrapers.models import ListingType
+from .core.scrapers.models import ListingType, SearchPropertyType


 def scrape_property(
     location: str,
     listing_type: str = "for_sale",
+    property_type: list[str] | None = None,
     radius: float = None,
     mls_only: bool = False,
     past_days: int = None,
@@ -24,6 +25,7 @@ def scrape_property(
     Scrape properties from Realtor.com based on a given location and listing type.
     :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
     :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
+    :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
     :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
     :param mls_only: If set, fetches only listings with MLS IDs.
     :param proxy: Proxy to use for scraping
@@ -41,6 +43,7 @@ def scrape_property(
     scraper_input = ScraperInput(
         location=location,
         listing_type=ListingType[listing_type.upper()],
+        property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
         proxy=proxy,
         radius=radius,
         mls_only=mls_only,
@@ -63,4 +66,6 @@ def scrape_property(
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=FutureWarning)
-        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})
+        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace(
+            {"None": pd.NA, None: pd.NA, "": pd.NA}
+        )

homeharvest/core/scrapers/__init__.py

@@ -5,7 +5,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 import uuid
 from ...exceptions import AuthenticationError
-from .models import Property, ListingType, SiteName
+from .models import Property, ListingType, SiteName, SearchPropertyType
 import json

@@ -13,6 +13,7 @@ import json
 class ScraperInput:
     location: str
     listing_type: ListingType
+    property_type: list[SearchPropertyType] | None = None
     radius: float | None = None
     mls_only: bool | None = False
     proxy: str | None = None
@@ -34,11 +35,12 @@ class Scraper:
     ):
         self.location = scraper_input.location
         self.listing_type = scraper_input.listing_type
+        self.property_type = scraper_input.property_type

         if not self.session:
             Scraper.session = requests.Session()
             retries = Retry(
-                total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
+                total=3, backoff_factor=4, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
             )

             adapter = HTTPAdapter(max_retries=retries)
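The bumped `backoff_factor` lengthens the sleeps between retried 429/403 responses. A rough sketch of the implied schedule, assuming urllib3's documented formula `backoff_factor * (2 ** (retry - 1))` (note that some urllib3 versions skip the sleep before the first retry):

```python
# Waits implied by Retry(total=3, backoff_factor=4): 4s, 8s, 16s.
for retry_number in (1, 2, 3):
    print(4 * (2 ** (retry_number - 1)))
```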
@@ -46,21 +48,21 @@ class Scraper:
             Scraper.session.mount("https://", adapter)
             Scraper.session.headers.update(
                 {
-                    'accept': 'application/json, text/javascript',
-                    'accept-language': 'en-US,en;q=0.9',
-                    'cache-control': 'no-cache',
-                    'content-type': 'application/json',
-                    'origin': 'https://www.realtor.com',
-                    'pragma': 'no-cache',
-                    'priority': 'u=1, i',
-                    'rdc-ab-tests': 'commute_travel_time_variation:v1',
-                    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
-                    'sec-ch-ua-mobile': '?0',
-                    'sec-ch-ua-platform': '"Windows"',
-                    'sec-fetch-dest': 'empty',
-                    'sec-fetch-mode': 'cors',
-                    'sec-fetch-site': 'same-origin',
-                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+                    "accept": "application/json, text/javascript",
+                    "accept-language": "en-US,en;q=0.9",
+                    "cache-control": "no-cache",
+                    "content-type": "application/json",
+                    "origin": "https://www.realtor.com",
+                    "pragma": "no-cache",
+                    "priority": "u=1, i",
+                    "rdc-ab-tests": "commute_travel_time_variation:v1",
+                    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+                    "sec-ch-ua-mobile": "?0",
+                    "sec-ch-ua-platform": '"Windows"',
+                    "sec-fetch-dest": "empty",
+                    "sec-fetch-mode": "cors",
+                    "sec-fetch-site": "same-origin",
+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
                 }
             )
@@ -94,27 +96,29 @@ class Scraper:
         response = requests.post(
             "https://graph.realtor.com/auth/token",
             headers={
-                'Host': 'graph.realtor.com',
-                'Accept': '*/*',
-                'Content-Type': 'Application/json',
-                'X-Client-ID': 'rdc_mobile_native,iphone',
-                'X-Visitor-ID': device_id,
-                'X-Client-Version': '24.21.23.679885',
-                'Accept-Language': 'en-US,en;q=0.9',
-                'User-Agent': 'Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0',
+                "Host": "graph.realtor.com",
+                "Accept": "*/*",
+                "Content-Type": "Application/json",
+                "X-Client-ID": "rdc_mobile_native,iphone",
+                "X-Visitor-ID": device_id,
+                "X-Client-Version": "24.21.23.679885",
+                "Accept-Language": "en-US,en;q=0.9",
+                "User-Agent": "Realtor.com/24.21.23.679885 CFNetwork/1494.0.7 Darwin/23.4.0",
             },
-            data=json.dumps({
-                "grant_type": "device_mobile",
-                "device_id": device_id,
-                "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone"
-            }))
+            data=json.dumps(
+                {
+                    "grant_type": "device_mobile",
+                    "device_id": device_id,
+                    "client_app_id": "rdc_mobile_native,24.21.23.679885,iphone",
+                }
+            ),
+        )

         data = response.json()
         if not (access_token := data.get("access_token")):
             raise AuthenticationError(
-                "Failed to get access token, use a proxy/vpn or wait a moment and try again.",
-                response=response
+                "Failed to get access token, use a proxy/vpn or wait a moment and try again.", response=response
             )

         return access_token

homeharvest/core/scrapers/models.py

@@ -17,6 +17,19 @@ class SiteName(Enum):
         raise ValueError(f"{value} not found in {cls}")


+class SearchPropertyType(Enum):
+    SINGLE_FAMILY = "single_family"
+    CONDOS = "condos"
+    CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop"
+    CONDO_TOWNHOME = "condo_townhome"
+    TOWNHOMES = "townhomes"
+    DUPLEX_TRIPLEX = "duplex_triplex"
+    FARM = "farm"
+    LAND = "land"
+    MULTI_FAMILY = "multi_family"
+    MOBILE = "mobile"
+
+
 class ListingType(Enum):
     FOR_SALE = "FOR_SALE"
     FOR_RENT = "FOR_RENT"
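The lowercase strings users pass map onto these members by uppercasing the name, which is exactly the lookup `scrape_property` performs above. A minimal sketch (assuming the package layout shown in this commit):

```python
from homeharvest.core.scrapers.models import SearchPropertyType

# "land" -> SearchPropertyType.LAND, the same lookup scrape_property performs.
user_input = ["land", "multi_family"]
parsed = [SearchPropertyType[prop.upper()] for prop in user_input]
print([pt.value for pt in parsed])  # ['land', 'multi_family']
```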

homeharvest/core/scrapers/realtor/__init__.py

@@ -6,12 +6,28 @@ This module implements the scraper for realtor.com
 """

 from __future__ import annotations

+import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
+from json import JSONDecodeError
 from typing import Dict, Union, Optional

+from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
+
 from .. import Scraper
-from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker, Builder, Advertisers, Office
+from ..models import (
+    Property,
+    Address,
+    ListingType,
+    Description,
+    PropertyType,
+    Agent,
+    Broker,
+    Builder,
+    Advertisers,
+    Office,
+)
 from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA
@@ -81,9 +97,12 @@
         return property_info["listings"][0]["listing_id"]

     def handle_home(self, property_id: str) -> list[Property]:
-        query = """query Home($property_id: ID!) {
-                    home(property_id: $property_id) %s
-                }""" % HOMES_DATA
+        query = (
+            """query Home($property_id: ID!) {
+                    home(property_id: $property_id) %s
+                }"""
+            % HOMES_DATA
+        )

         variables = {"property_id": property_id}
         payload = {
@@ -96,9 +115,7 @@
         property_info = response_json["data"]["home"]

-        return [
-            self.process_property(property_info, "home")
-        ]
+        return [self.process_property(property_info, "home")]

     @staticmethod
     def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@@ -122,7 +139,7 @@
                 phones=advertiser.get("phones"),
             )

-        if advertiser.get('broker') and advertiser["broker"].get('name'):  #: has a broker
+        if advertiser.get("broker") and advertiser["broker"].get("name"):  #: has a broker
             processed_advertisers.broker = Broker(
                 uuid=_parse_fulfillment_id(advertiser["broker"].get("fulfillment_id")),
                 name=advertiser["broker"].get("name"),
@@ -153,15 +170,16 @@
             return

         able_to_get_lat_long = (
             result
             and result.get("location")
             and result["location"].get("address")
             and result["location"]["address"].get("coordinate")
         )

-        is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
+        is_pending = result["flags"].get("is_pending")
+        is_contingent = result["flags"].get("is_contingent")

-        if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING):
+        if (is_pending or is_contingent) and (self.exclude_pending and self.listing_type != ListingType.PENDING):
             return

         property_id = result["property_id"]
@@ -184,7 +202,7 @@
             property_url=result["href"],
             property_id=property_id,
             listing_id=result.get("listing_id"),
-            status="PENDING" if is_pending else result["status"].upper(),
+            status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(),
             list_price=result["list_price"],
             list_price_min=result["list_price_min"],
             list_price_max=result["list_price_max"],
@@ -225,6 +243,11 @@
         elif self.last_x_days:
             date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'

+        property_type_param = ""
+        if self.property_type:
+            property_types = [pt.value for pt in self.property_type]
+            property_type_param = f"type: {json.dumps(property_types)}"
+
         sort_param = (
             "sort: [{ field: sold_date, direction: desc }]"
             if self.listing_type == ListingType.SOLD
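For reference, `json.dumps` happens to render a Python list of strings in GraphQL-compatible array syntax, so the fragment spliced into the search query looks like this (illustrative values):

```python
import json

property_types = ["land", "mobile"]
print(f"type: {json.dumps(property_types)}")  # type: ["land", "mobile"]
```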
@@ -259,6 +282,7 @@
                         status: %s
                         %s
                         %s
+                        %s
                     }
                     %s
                     limit: 200
@@ -268,6 +292,7 @@
             is_foreclosure,
             listing_type.value.lower(),
             date_param,
+            property_type_param,
             pending_or_contingent_param,
             sort_param,
             GENERAL_RESULTS_QUERY,
@@ -290,6 +315,7 @@
                         status: %s
                         %s
                         %s
+                        %s
                     }
                     %s
                     limit: 200
@@ -299,13 +325,14 @@
                 is_foreclosure,
                 listing_type.value.lower(),
                 date_param,
+                property_type_param,
                 pending_or_contingent_param,
                 sort_param,
                 GENERAL_RESULTS_QUERY,
             )
         else:  #: general search, came from an address
             query = (
                 """query Property_search(
                     $property_id: [ID]!
                     $offset: Int!,
                 ) {
@@ -317,7 +344,7 @@
                     offset: $offset
                 ) %s
                 }"""
                 % GENERAL_RESULTS_QUERY
             )

         payload = {
@@ -332,12 +359,12 @@
         properties: list[Property] = []

         if (
             response_json is None
             or "data" not in response_json
             or response_json["data"] is None
             or search_key not in response_json["data"]
             or response_json["data"][search_key] is None
             or "results" not in response_json["data"][search_key]
         ):
             return {"total": 0, "properties": []}
@@ -347,12 +374,10 @@

         #: limit the number of properties to be processed
         #: example, if your offset is 200, and your limit is 250, return 50
-        properties_list = properties_list[:self.limit - offset]
+        properties_list = properties_list[: self.limit - offset]

         with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
-            futures = [
-                executor.submit(self.process_property, result, search_key) for result in properties_list
-            ]
+            futures = [executor.submit(self.process_property, result, search_key) for result in properties_list]

             for future in as_completed(futures):
                 result = future.result()
@@ -451,6 +476,9 @@
             "assessed_value": assessed_value if assessed_value else None,
         }

+    @retry(
+        retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3)
+    )
     def get_prop_details(self, property_id: str) -> dict:
         if not self.extra_property_data:
             return {}
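The new tenacity decorator retries `get_prop_details` only on `JSONDecodeError`, with exponential waits bounded between 4 and 10 seconds and at most 3 attempts. A self-contained sketch of the same pattern (the flaky function here is hypothetical, not part of the library):

```python
from json import JSONDecodeError

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

calls = {"n": 0}

@retry(
    retry=retry_if_exception_type(JSONDecodeError),
    wait=wait_exponential(min=4, max=10),
    stop=stop_after_attempt(3),
)
def flaky_fetch() -> dict:
    # Hypothetical stand-in for a response whose body sometimes isn't valid JSON yet.
    calls["n"] += 1
    if calls["n"] < 3:
        raise JSONDecodeError("Expecting value", "", 0)
    return {"ok": True}

print(flaky_fetch())  # succeeds on the third attempt after two waits
```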
@@ -534,7 +562,9 @@
             style = style.upper()

         primary_photo = ""
-        if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")):
+        if (primary_photo_info := result.get("primary_photo")) and (
+            primary_photo_href := primary_photo_info.get("href")
+        ):
             primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")

         return Description(
@@ -547,7 +577,7 @@
             sqft=description_data.get("sqft"),
             lot_sqft=description_data.get("lot_sqft"),
             sold_price=(
-                result.get('last_sold_price') or description_data.get("sold_price")
+                result.get("last_sold_price") or description_data.get("sold_price")
                 if result.get("last_sold_date") or result["list_price"] != description_data.get("sold_price")
                 else None
             ),  #: has a sold date or list and sold price are different
@@ -581,4 +611,8 @@
         if not photos_info:
             return None

-        return [photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") for photo_info in photos_info if photo_info.get("href")]
+        return [
+            photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
+            for photo_info in photos_info
+            if photo_info.get("href")
+        ]

poetry.lock (generated, 19 lines changed)

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

 [[package]]
 name = "annotated-types"
@@ -667,6 +667,21 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

+[[package]]
+name = "tenacity"
+version = "9.0.0"
+description = "Retry code until it succeeds"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
+    {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
+]
+
+[package.extras]
+doc = ["reno", "sphinx"]
+test = ["pytest", "tornado (>=4.5)", "typeguard"]
+
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -740,4 +755,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "21ef9cfb35c446a375a2b74c37691d7031afb1e4f66a8b63cb7c1669470689d2"
+content-hash = "cefc11b1bf5ad99d628f6d08f6f03003522cc1b6e48b519230d99d716a5c165c"

pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.4.3"
+version = "0.4.4"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -14,6 +14,7 @@ python = ">=3.9,<3.13"
 requests = "^2.31.0"
 pandas = "^2.1.1"
 pydantic = "^2.7.4"
+tenacity = "^9.0.0"

 [tool.poetry.group.dev.dependencies]

tests/test_realtor.py

@@ -105,8 +105,12 @@ def test_realtor():
             location="2530 Al Lipscomb Way",
             listing_type="for_sale",
         ),
-        scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000),  #: does not support "city, state, USA" format
-        scrape_property(location="Dallas, TX", listing_type="sold", limit=1000),  #: does not support "city, state, USA" format
+        scrape_property(
+            location="Phoenix, AZ", listing_type="for_rent", limit=1000
+        ),  #: does not support "city, state, USA" format
+        scrape_property(
+            location="Dallas, TX", listing_type="sold", limit=1000
+        ),  #: does not support "city, state, USA" format
         scrape_property(location="85281"),
     ]

@@ -114,11 +118,13 @@ def test_realtor():
 def test_realtor_city():
-    results = scrape_property(
-        location="Atlanta, GA",
-        listing_type="for_sale",
-        limit=1000
-    )
+    results = scrape_property(location="Atlanta, GA", listing_type="for_sale", limit=1000)

     assert results is not None and len(results) > 0


+def test_realtor_land():
+    results = scrape_property(location="Atlanta, GA", listing_type="for_sale", property_type=["land"], limit=1000)
+
+    assert results is not None and len(results) > 0

@@ -241,9 +247,10 @@ def test_apartment_list_price():
     results = results[results["style"] == "APARTMENT"]

     #: get percentage of results with atleast 1 of any column not none, list_price, list_price_min, list_price_max
-    assert len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(
-        results
-    ) > 0.5
+    assert (
+        len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(results)
+        > 0.5
+    )


 def test_builder_exists():