- primary photo bug fix

- limit parameter
master v0.3.33
Zachary Hampton 2024-07-15 07:19:57 -07:00
parent ac0cad62a7
commit 3f44744d61
6 changed files with 44 additions and 28 deletions

View File

@@ -94,7 +94,9 @@ Optional
 ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
-└── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
+├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
+└── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
 ```
 ### Property Schema
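For context, a minimal usage sketch of the new parameter, assuming the package is imported as in its README; the location string and listing type below are illustrative:

```python
from homeharvest import scrape_property

# Cap the search at 500 results instead of the 10,000 default; values outside
# 1-10,000 are rejected by the new validate_limit check.
properties = scrape_property(
    location="San Diego, CA",   # illustrative location
    listing_type="for_sale",    # illustrative listing type
    limit=500,
)
print(len(properties))  # at most 500 rows in the returned DataFrame
```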

View File

@@ -1,7 +1,7 @@
 import warnings
 import pandas as pd
 from .core.scrapers import ScraperInput
-from .utils import process_result, ordered_properties, validate_input, validate_dates
+from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
 from .core.scrapers.realtor import RealtorScraper
 from .core.scrapers.models import ListingType
@@ -18,6 +18,7 @@ def scrape_property(
     foreclosure: bool = None,
     extra_property_data: bool = True,
     exclude_pending: bool = False,
+    limit: int = 10000,
 ) -> pd.DataFrame:
     """
     Scrape properties from Realtor.com based on a given location and listing type.
@@ -31,9 +32,11 @@ def scrape_property(
     :param foreclosure: If set, fetches only foreclosure listings.
     :param extra_property_data: Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.)
     :param exclude_pending: If true, this excludes pending or contingent properties from the results, unless listing type is pending.
+    :param limit: Limit the number of results returned. Maximum is 10,000.
     """
     validate_input(listing_type)
     validate_dates(date_from, date_to)
+    validate_limit(limit)

     scraper_input = ScraperInput(
         location=location,
@@ -47,6 +50,7 @@ def scrape_property(
         foreclosure=foreclosure,
         extra_property_data=extra_property_data,
         exclude_pending=exclude_pending,
+        limit=limit,
     )

     site = RealtorScraper(scraper_input)
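A sketch of the new validation path (location and listing type are again illustrative): an out-of-range limit is rejected before any request is made, since validate_limit runs ahead of ScraperInput construction.

```python
from homeharvest import scrape_property

try:
    # 0 is below the allowed 1-10,000 range, so this raises before any scraping starts
    scrape_property(location="Dallas, TX", listing_type="for_sale", limit=0)
except ValueError as err:
    print(err)  # Property limit must be between 1 and 10,000.
```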

View File

@@ -22,6 +22,7 @@ class ScraperInput:
     foreclosure: bool | None = False
     extra_property_data: bool | None = True
     exclude_pending: bool | None = False
+    limit: int = 10000

 class Scraper:
@@ -64,6 +65,7 @@ class Scraper:
         self.foreclosure = scraper_input.foreclosure
         self.extra_property_data = scraper_input.extra_property_data
         self.exclude_pending = scraper_input.exclude_pending
+        self.limit = scraper_input.limit

     def search(self) -> list[Property]: ...

View File

@@ -115,10 +115,10 @@ class RealtorScraper(Scraper):
         )
         able_to_get_lat_long = (
             property_info
             and property_info.get("address")
             and property_info["address"].get("location")
             and property_info["address"]["location"].get("coordinate")
         )
         list_date_str = (
             property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None
@@ -481,7 +481,7 @@ class RealtorScraper(Scraper):
             )
         else:  #: general search, came from an address
             query = (
                 """query Property_search(
                     $property_id: [ID]!
                     $offset: Int!,
                 ) {
@@ -492,7 +492,7 @@
                     limit: 1
                     offset: $offset
                 ) %s"""
                 % results_query
             )
         payload = {
@@ -507,12 +507,12 @@ class RealtorScraper(Scraper):
         properties: list[Property] = []
         if (
             response_json is None
             or "data" not in response_json
             or response_json["data"] is None
             or search_key not in response_json["data"]
             or response_json["data"][search_key] is None
             or "results" not in response_json["data"][search_key]
         ):
             return {"total": 0, "properties": []}
@@ -523,10 +523,10 @@ class RealtorScraper(Scraper):
                 return
             able_to_get_lat_long = (
                 result
                 and result.get("location")
                 and result["location"].get("address")
                 and result["location"]["address"].get("coordinate")
             )
             is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
@@ -654,7 +654,7 @@ class RealtorScraper(Scraper):
                     variables=search_variables | {"offset": i},
                     search_type=search_type,
                 )
-                for i in range(200, min(total, 10000), 200)
+                for i in range(200, min(total, self.limit), 200)
             ]

             for future in as_completed(futures):
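To illustrate how the cap interacts with the 200-result page size used in this loop (assuming, as the range starting at 200 suggests, that the first page is fetched before these futures are scheduled):

```python
# Hypothetical numbers, just to show the offset arithmetic of the loop above
total = 1_500   # total matches reported by the search
limit = 600     # user-supplied cap

extra_offsets = list(range(200, min(total, limit), 200))
print(extra_offsets)  # [200, 400] -> with the initial page, at most 600 results are requested
```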
@@ -790,7 +790,10 @@
         )

         @staticmethod
-        def _parse_description(result: dict) -> Description:
+        def _parse_description(result: dict) -> Description | None:
+            if not result:
+                return None
+
             description_data = result.get("description", {})

             if description_data is None or not isinstance(description_data, dict):
@@ -801,11 +804,8 @@
                 style = style.upper()

             primary_photo = ""
-            if result and "primary_photo" in result:
-                primary_photo_info = result["primary_photo"]
-                if primary_photo_info and "href" in primary_photo_info:
-                    primary_photo_href = primary_photo_info["href"]
-                    primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
+            if (primary_photo_info := result.get('primary_photo')) and (primary_photo_href := primary_photo_info.get("href")):
+                primary_photo = primary_photo_href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")

             return Description(
                 primary_photo=primary_photo,
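A standalone sketch of the new guard, mirroring the walrus-operator logic above with a hypothetical helper name and inputs:

```python
def pick_primary_photo(result: dict | None) -> str:
    """Return the upscaled primary photo URL, or "" when anything is missing."""
    primary_photo = ""
    if result and (info := result.get("primary_photo")) and (href := info.get("href")):
        # Swap Realtor's thumbnail suffix for the larger webp variant, as in the diff above
        primary_photo = href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75")
    return primary_photo

print(pick_primary_photo(None))                                      # ""
print(pick_primary_photo({"primary_photo": None}))                   # ""
print(pick_primary_photo({"primary_photo": {"href": "123-s.jpg"}}))  # 123-od-w480_h360_x2.webp?w=1080&q=75
```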

View File

@@ -86,7 +86,8 @@ def process_result(result: Property) -> pd.DataFrame:
     if description:
         prop_data["primary_photo"] = description.primary_photo
         prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
-        prop_data["style"] = description.style if isinstance(description.style, str) else description.style.value if description.style else None
+        prop_data["style"] = description.style if isinstance(description.style,
+                                                              str) else description.style.value if description.style else None
         prop_data["beds"] = description.beds
         prop_data["full_baths"] = description.baths_full
         prop_data["half_baths"] = description.baths_half
@@ -110,7 +111,7 @@ def validate_input(listing_type: str) -> None:
 def validate_dates(date_from: str | None, date_to: str | None) -> None:
-    if (date_from is not None and date_to is None) or (date_from is None and date_to is not None):
+    if isinstance(date_from, str) != isinstance(date_to, str):
         raise InvalidDate("Both date_from and date_to must be provided.")

     if date_from and date_to:
@@ -122,3 +123,10 @@ def validate_dates(date_from: str | None, date_to: str | None) -> None:
                 raise InvalidDate("date_to must be after date_from.")
         except ValueError:
             raise InvalidDate(f"Invalid date format or range")
+
+def validate_limit(limit: int) -> None:
+    #: 1 -> 10000 limit
+    if limit is not None and (limit < 1 or limit > 10000):
+        raise ValueError("Property limit must be between 1 and 10,000.")
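A quick sketch of the new validator's behaviour; the import path is an assumption based on the relative `.utils` import shown above:

```python
from homeharvest.utils import validate_limit  # assumed module path

validate_limit(500)   # ok: within 1-10,000
validate_limit(None)  # ok: the check is skipped when no limit is given

try:
    validate_limit(20_000)
except ValueError as err:
    print(err)  # Property limit must be between 1 and 10,000.
```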

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "homeharvest"
-version = "0.3.32"
+version = "0.3.33"
 description = "Real estate scraping library"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/HomeHarvest"