Compare commits

..

No commits in common. "master" and "v0.4.4" have entirely different histories.

10 changed files with 86 additions and 267 deletions

1
.github/FUNDING.yml vendored
View File

@ -1 +0,0 @@
github: Bunsly

View File

@ -2,6 +2,10 @@
**HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings. **HomeHarvest** is a real estate scraping library that extracts and formats data in the style of MLS listings.
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com)** *to work with us.*
## HomeHarvest Features ## HomeHarvest Features
- **Source**: Fetches properties directly from **Realtor.com**. - **Source**: Fetches properties directly from **Realtor.com**.
@ -34,9 +38,9 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property( properties = scrape_property(
location="San Diego, CA", location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
property_type='single_family',
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# property_type=['single_family','multi_family'],
# date_from="2023-05-01", # alternative to past_days # date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28", # date_to="2023-05-28",
# foreclosure=True # foreclosure=True
@ -83,12 +87,7 @@ Optional
- 'farm' - 'farm'
- 'land' - 'land'
- 'mobile' - 'mobile'
├── return_type (option): Choose the return type.
│ - 'pandas' (default)
│ - 'pydantic'
│ - 'raw' (json)
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses. ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored) │ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
@ -155,14 +154,6 @@ Property
│ ├── new_construction │ ├── new_construction
│ └── hoa_fee │ └── hoa_fee
├── Tax Information:
│ ├── year
│ ├── tax
│ ├── assessment
│ │ ├── building
│ │ ├── land
│ │ └── total
├── Location Details: ├── Location Details:
│ ├── latitude │ ├── latitude
│ ├── longitude │ ├── longitude

View File

@ -3,13 +3,12 @@ import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType, SearchPropertyType, ReturnType, Property from .core.scrapers.models import ListingType, SearchPropertyType
def scrape_property( def scrape_property(
location: str, location: str,
listing_type: str = "for_sale", listing_type: str = "for_sale",
return_type: str = "pandas",
property_type: list[str] | None = None, property_type: list[str] | None = None,
radius: float = None, radius: float = None,
mls_only: bool = False, mls_only: bool = False,
@ -20,13 +19,12 @@ def scrape_property(
foreclosure: bool = None, foreclosure: bool = None,
extra_property_data: bool = True, extra_property_data: bool = True,
exclude_pending: bool = False, exclude_pending: bool = False,
limit: int = 10000 limit: int = 10000,
) -> pd.DataFrame | list[dict] | list[Property]: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way") :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold, pending) :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
:param return_type: Return type (pandas, pydantic, raw)
:param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile) :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
@ -44,8 +42,7 @@ def scrape_property(
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType(listing_type.upper()), listing_type=ListingType[listing_type.upper()],
return_type=ReturnType(return_type.lower()),
property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None, property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
proxy=proxy, proxy=proxy,
radius=radius, radius=radius,
@ -62,9 +59,6 @@ def scrape_property(
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
results = site.search() results = site.search()
if scraper_input.return_type != ReturnType.pandas:
return results
properties_dfs = [df for result in results if not (df := process_result(result)).empty] properties_dfs = [df for result in results if not (df := process_result(result)).empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()

View File

@ -1,13 +1,11 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import Union
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
import uuid import uuid
from ...exceptions import AuthenticationError from ...exceptions import AuthenticationError
from .models import Property, ListingType, SiteName, SearchPropertyType, ReturnType from .models import Property, ListingType, SiteName, SearchPropertyType
import json import json
@ -26,7 +24,6 @@ class ScraperInput:
extra_property_data: bool | None = True extra_property_data: bool | None = True
exclude_pending: bool | None = False exclude_pending: bool | None = False
limit: int = 10000 limit: int = 10000
return_type: ReturnType = ReturnType.pandas
class Scraper: class Scraper:
@ -84,9 +81,8 @@ class Scraper:
self.extra_property_data = scraper_input.extra_property_data self.extra_property_data = scraper_input.extra_property_data
self.exclude_pending = scraper_input.exclude_pending self.exclude_pending = scraper_input.exclude_pending
self.limit = scraper_input.limit self.limit = scraper_input.limit
self.return_type = scraper_input.return_type
def search(self) -> list[Union[Property | dict]]: ... def search(self) -> list[Property]: ...
@staticmethod @staticmethod
def _parse_home(home) -> Property: ... def _parse_home(home) -> Property: ...

View File

@ -4,12 +4,6 @@ from enum import Enum
from typing import Optional from typing import Optional
class ReturnType(Enum):
pydantic = "pydantic"
pandas = "pandas"
raw = "raw"
class SiteName(Enum): class SiteName(Enum):
ZILLOW = "zillow" ZILLOW = "zillow"
REDFIN = "redfin" REDFIN = "redfin"
@ -25,7 +19,6 @@ class SiteName(Enum):
class SearchPropertyType(Enum): class SearchPropertyType(Enum):
SINGLE_FAMILY = "single_family" SINGLE_FAMILY = "single_family"
APARTMENT = "apartment"
CONDOS = "condos" CONDOS = "condos"
CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop" CONDO_TOWNHOME_ROWHOME_COOP = "condo_townhome_rowhome_coop"
CONDO_TOWNHOME = "condo_townhome" CONDO_TOWNHOME = "condo_townhome"
@ -155,9 +148,6 @@ class Property:
property_url: str property_url: str
property_id: str property_id: str
#: allows_cats: bool
#: allows_dogs: bool
listing_id: str | None = None listing_id: str | None = None
mls: str | None = None mls: str | None = None
@ -177,8 +167,6 @@ class Property:
hoa_fee: int | None = None hoa_fee: int | None = None
days_on_mls: int | None = None days_on_mls: int | None = None
description: Description | None = None description: Description | None = None
tags: list[str] | None = None
details: list[dict] | None = None
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
@ -188,7 +176,5 @@ class Property:
nearby_schools: list[str] = None nearby_schools: list[str] = None
assessed_value: int | None = None assessed_value: int | None = None
estimated_value: int | None = None estimated_value: int | None = None
tax: int | None = None
tax_history: list[dict] | None = None
advertisers: Advertisers | None = None advertisers: Advertisers | None = None

View File

@ -13,12 +13,7 @@ from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from tenacity import ( from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
retry,
retry_if_exception_type,
wait_exponential,
stop_after_attempt,
)
from .. import Scraper from .. import Scraper
from ..models import ( from ..models import (
@ -32,9 +27,8 @@ from ..models import (
Builder, Builder,
Advertisers, Advertisers,
Office, Office,
ReturnType
) )
from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA, HOME_FRAGMENT from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
@ -121,7 +115,7 @@ class RealtorScraper(Scraper):
property_info = response_json["data"]["home"] property_info = response_json["data"]["home"]
return [self.process_property(property_info)] return [self.process_property(property_info, "home")]
@staticmethod @staticmethod
def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None: def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None:
@ -169,7 +163,7 @@ class RealtorScraper(Scraper):
return processed_advertisers return processed_advertisers
def process_property(self, result: dict) -> Property | None: def process_property(self, result: dict, query_name: str) -> Property | None:
mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
if not mls and self.mls_only: if not mls and self.mls_only:
@ -189,7 +183,9 @@ class RealtorScraper(Scraper):
return return
property_id = result["property_id"] property_id = result["property_id"]
prop_details = self.process_extra_property_details(result) if self.extra_property_data else {} prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {}
if not prop_details:
prop_details = self.process_extra_property_details(result)
property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues") property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues")
estimated_value = self.get_key(property_estimates_root, [0, "estimate"]) estimated_value = self.get_key(property_estimates_root, [0, "estimate"])
@ -206,33 +202,31 @@ class RealtorScraper(Scraper):
property_url=result["href"], property_url=result["href"],
property_id=property_id, property_id=property_id,
listing_id=result.get("listing_id"), listing_id=result.get("listing_id"),
status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()), status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(),
list_price=result["list_price"], list_price=result["list_price"],
list_price_min=result["list_price_min"], list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"], list_price_max=result["list_price_max"],
list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None), list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
prc_sqft=result.get("price_per_sqft"), prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"), last_sold_date=result.get("last_sold_date"),
new_construction=result["flags"].get("is_new_construction") is True, new_construction=result["flags"].get("is_new_construction") is True,
hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None), hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None), latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None), longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result), description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result), neighborhoods=self._parse_neighborhoods(result),
county=(result["location"]["county"].get("name") if result["location"]["county"] else None), county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None), fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result), days_on_mls=self.calculate_days_on_mls(result),
nearby_schools=prop_details.get("schools"), nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"), assessed_value=prop_details.get("assessed_value"),
estimated_value=estimated_value if estimated_value else None, estimated_value=estimated_value if estimated_value else None,
advertisers=advertisers, advertisers=advertisers,
tax=prop_details.get("tax"),
tax_history=prop_details.get("tax_history"),
) )
return realty_property return realty_property
def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, Union[list[Property], list[dict]]]]: def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
@ -257,7 +251,7 @@ class RealtorScraper(Scraper):
sort_param = ( sort_param = (
"sort: [{ field: sold_date, direction: desc }]" "sort: [{ field: sold_date, direction: desc }]"
if self.listing_type == ListingType.SOLD if self.listing_type == ListingType.SOLD
else "" #: "sort: [{ field: list_date, direction: desc }]" #: prioritize normal fractal sort from realtor else "sort: [{ field: list_date, direction: desc }]"
) )
pending_or_contingent_param = ( pending_or_contingent_param = (
@ -305,20 +299,24 @@ class RealtorScraper(Scraper):
) )
elif search_type == "area": #: general search, came from a general location elif search_type == "area": #: general search, came from a general location
query = """query Home_search( query = """query Home_search(
$location: String!, $city: String,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int, $offset: Int,
) { ) {
home_search( home_search(
query: { query: {
%s %s
search_location: {location: $location} city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s status: %s
unique: true
%s %s
%s %s
%s %s
} }
bucket: { sort: "fractal_v1.1.3_fr" }
%s %s
limit: 200 limit: 200
offset: $offset offset: $offset
@ -358,7 +356,7 @@ class RealtorScraper(Scraper):
response_json = response.json() response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search" search_key = "home_search" if "home_search" in query else "property_search"
properties: list[Union[Property, dict]] = [] properties: list[Property] = []
if ( if (
response_json is None response_json is None
@ -376,25 +374,15 @@ class RealtorScraper(Scraper):
#: limit the number of properties to be processed #: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50 #: example, if your offset is 200, and your limit is 250, return 50
properties_list: list[dict] = properties_list[: self.limit - offset] properties_list = properties_list[: self.limit - offset]
if self.extra_property_data: with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
property_ids = [data["property_id"] for data in properties_list] futures = [executor.submit(self.process_property, result, search_key) for result in properties_list]
extra_property_details = self.get_bulk_prop_details(property_ids) or {}
for result in properties_list: for future in as_completed(futures):
result.update(extra_property_details.get(result["property_id"], {})) result = future.result()
if result:
if self.return_type != ReturnType.raw: properties.append(result)
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [executor.submit(self.process_property, result) for result in properties_list]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
else:
properties = properties_list
return { return {
"total": total_properties, "total": total_properties,
@ -439,7 +427,10 @@ class RealtorScraper(Scraper):
else: #: general search, location else: #: general search, location
search_variables |= { search_variables |= {
"location": self.location, "city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
} }
if self.foreclosure: if self.foreclosure:
@ -456,11 +447,7 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
search_type=search_type, search_type=search_type,
) )
for i in range( for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
self.DEFAULT_PAGE_SIZE,
min(total, self.limit),
self.DEFAULT_PAGE_SIZE,
)
] ]
for future in as_completed(futures): for future in as_completed(futures):
@ -482,75 +469,38 @@ class RealtorScraper(Scraper):
def process_extra_property_details(self, result: dict) -> dict: def process_extra_property_details(self, result: dict) -> dict:
schools = self.get_key(result, ["nearbySchools", "schools"]) schools = self.get_key(result, ["nearbySchools", "schools"])
assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"]) assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"])
tax_history = self.get_key(result, ["taxHistory"])
schools = [school["district"]["name"] for school in schools if school["district"].get("name")] schools = [school["district"]["name"] for school in schools if school["district"].get("name")]
# Process tax history
latest_tax = None
processed_tax_history = None
if tax_history and isinstance(tax_history, list):
tax_history = sorted(tax_history, key=lambda x: x.get("year", 0), reverse=True)
if tax_history and "tax" in tax_history[0]:
latest_tax = tax_history[0]["tax"]
processed_tax_history = []
for entry in tax_history:
if "year" in entry and "tax" in entry:
processed_entry = {
"year": entry["year"],
"tax": entry["tax"],
}
if "assessment" in entry and isinstance(entry["assessment"], dict):
processed_entry["assessment"] = {
"building": entry["assessment"].get("building"),
"land": entry["assessment"].get("land"),
"total": entry["assessment"].get("total"),
}
processed_tax_history.append(processed_entry)
return { return {
"schools": schools if schools else None, "schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None, "assessed_value": assessed_value if assessed_value else None,
"tax": latest_tax,
"tax_history": processed_tax_history,
} }
@retry( @retry(
retry=retry_if_exception_type(JSONDecodeError), retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3)
wait=wait_exponential(min=4, max=10),
stop=stop_after_attempt(3),
) )
def get_bulk_prop_details(self, property_ids: list[str]) -> dict: def get_prop_details(self, property_id: str) -> dict:
""" if not self.extra_property_data:
Fetch extra property details for multiple properties in a single GraphQL query.
Returns a map of property_id to its details.
"""
if not self.extra_property_data or not property_ids:
return {} return {}
property_ids = list(set(property_ids)) query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
# Construct the bulk query nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
fragments = "\n".join( __typename schools { district { __typename id name } }
f'home_{property_id}: home(property_id: {property_id}) {{ ...HomeData }}' }
for property_id in property_ids taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
) }
query = f"""{HOME_FRAGMENT} }"""
query GetHomes {{ variables = {"property_id": property_id}
{fragments} response = self.session.post(self.SEARCH_GQL_URL, json={"query": query, "variables": variables})
}}"""
response = self.session.post(self.SEARCH_GQL_URL, json={"query": query})
data = response.json() data = response.json()
property_details = data["data"]["home"]
if "data" not in data: return self.process_extra_property_details(property_details)
return {}
properties = data["data"]
return {data.replace('home_', ''): properties[data] for data in properties if properties[data]}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:
@ -620,7 +570,7 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])), alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None), style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@ -11,34 +11,6 @@ _SEARCH_HOMES_DATA_BASE = """{
list_price_max list_price_max
list_price_min list_price_min
price_per_sqft price_per_sqft
tags
details {
category
text
parent_category
}
pet_policy {
cats
dogs
dogs_small
dogs_large
__typename
}
units {
availability {
date
__typename
}
description {
baths_consolidated
baths
beds
sqft
__typename
}
list_price
__typename
}
flags { flags {
is_contingent is_contingent
is_pending is_pending
@ -92,14 +64,11 @@ _SEARCH_HOMES_DATA_BASE = """{
tax_record { tax_record {
public_record_id public_record_id
} }
primary_photo(https: true) { primary_photo {
href href
} }
photos(https: true) { photos {
href href
tags {
label
}
} }
advertisers { advertisers {
email email
@ -147,63 +116,15 @@ _SEARCH_HOMES_DATA_BASE = """{
} }
rental_management { rental_management {
name name
href
fulfillment_id fulfillment_id
} }
} }
""" """
HOME_FRAGMENT = """
fragment HomeData on Home {
property_id
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } }
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
}
"""
HOMES_DATA = """%s HOMES_DATA = """%s
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
__typename schools { district { __typename id name } } __typename schools { district { __typename id name } }
} }
monthly_fees {
description
display_amount
}
one_time_fees {
description
display_amount
}
parking {
unassigned_space_rent
assigned_spaces_available
description
assigned_space_rent
}
terms {
text
category
}
taxHistory: tax_history { __typename tax year assessment { __typename building land total } } taxHistory: tax_history { __typename tax year assessment { __typename building land total } }
estimates { estimates {
__typename __typename
@ -220,19 +141,19 @@ HOMES_DATA = """%s
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
SEARCH_HOMES_DATA = """%s SEARCH_HOMES_DATA = """%s
current_estimates { current_estimates {
__typename __typename
source { source {
__typename __typename
type type
name name
} }
estimate estimate
estimateHigh: estimate_high estimateHigh: estimate_high
estimateLow: estimate_low estimateLow: estimate_low
date date
isBestHomeValue: isbest_homevalue isBestHomeValue: isbest_homevalue
} }
}""" % _SEARCH_HOMES_DATA_BASE }""" % _SEARCH_HOMES_DATA_BASE
GENERAL_RESULTS_QUERY = """{ GENERAL_RESULTS_QUERY = """{

View File

@ -33,8 +33,6 @@ ordered_properties = [
"last_sold_date", "last_sold_date",
"assessed_value", "assessed_value",
"estimated_value", "estimated_value",
"tax",
"tax_history",
"new_construction", "new_construction",
"lot_sqft", "lot_sqft",
"price_per_sqft", "price_per_sqft",
@ -117,11 +115,8 @@ def process_result(result: Property) -> pd.DataFrame:
if description: if description:
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = ( prop_data["style"] = description.style if isinstance(description.style,
description.style str) else description.style.value if description.style else None
if isinstance(description.style, str)
else description.style.value if description.style else None
)
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.4.7" version = "0.4.4"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -1,5 +1,4 @@
from homeharvest import scrape_property, Property from homeharvest import scrape_property
import pandas as pd
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
@ -288,15 +287,3 @@ def test_phone_number_matching():
#: assert phone numbers are the same #: assert phone numbers are the same
assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0] assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]
def test_return_type():
results = {
"pandas": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100),
"pydantic": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="pydantic"),
"raw": scrape_property(location="Surprise, AZ", listing_type="for_rent", limit=100, return_type="raw"),
}
assert isinstance(results["pandas"], pd.DataFrame)
assert isinstance(results["pydantic"][0], Property)
assert isinstance(results["raw"][0], dict)