diff --git a/README.md b/README.md index 3f6a65f..76a1618 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Optional │ ├── proxy (string): In format 'http://user:pass@host:port' │ -├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data (e.g. agent, broker, property evaluations etc.) +├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data for general searches (e.g. schools, tax appraisals etc.) │ ├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending' │ @@ -123,6 +123,7 @@ Property │ ├── sqft │ ├── year_built │ ├── stories +│ ├── garage │ └── lot_sqft ├── Property Listing Details: @@ -135,7 +136,7 @@ Property │ ├── sold_price │ ├── last_sold_date │ ├── price_per_sqft -│ ├── parking_garage +│ ├── new_construction │ └── hoa_fee ├── Location Details: @@ -143,21 +144,31 @@ Property │ ├── longitude │ ├── nearby_schools - ├── Agent Info: -│ ├── agent +│ ├── agent_id +│ ├── agent_name │ ├── agent_email │ └── agent_phone ├── Broker Info: -│ ├── broker -│ ├── broker_email -│ └── broker_website +│ ├── broker_id +│ └── broker_name + +├── Builder Info: +│ ├── builder_id +│ └── builder_name + +├── Office Info: +│ ├── office_id +│ ├── office_name +│ ├── office_phones +│ └── office_email + ``` ### Exceptions The following exceptions may be raised when using HomeHarvest: -- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` +- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`, `pending`. - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD. - `AuthenticationError` - Realtor.com token request failed. diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 15b6da7..64308e1 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -90,18 +90,41 @@ class AgentPhone: #: For documentation purposes only (at the moment) @dataclass -class Agent: - name: str | None = None +class Entity: + name: str + uuid: str | None = None + + +@dataclass +class Agent(Entity): phones: list[dict] | AgentPhone | None = None email: str | None = None href: str | None = None @dataclass -class Broker: - name: str | None = None - phone: str | None = None - website: str | None = None +class Office(Entity): + email: str | None = None + href: str | None = None + phones: list[dict] | AgentPhone | None = None + + +@dataclass +class Broker(Entity): + pass + + +@dataclass +class Builder(Entity): + pass + + +@dataclass +class Advertisers: + agent: Agent | None = None + broker: Broker | None = None + builder: Builder | None = None + office: Office | None = None @dataclass @@ -120,6 +143,7 @@ class Property: pending_date: str | None = None last_sold_date: str | None = None prc_sqft: int | None = None + new_construction: bool | None = None hoa_fee: int | None = None days_on_mls: int | None = None description: Description | None = None @@ -129,8 +153,8 @@ class Property: neighborhoods: Optional[str] = None county: Optional[str] = None fips_code: Optional[str] = None - agents: list[Agent] | None = None - brokers: list[Broker] | None = None nearby_schools: list[str] = None assessed_value: int | None = None estimated_value: int | None = None + + advertisers: Advertisers | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 7045512..6b05d78 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -11,7 +11,8 @@ from datetime import datetime from typing import Dict, Union, Optional from .. import Scraper -from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker +from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker, Builder, Advertisers, Office +from .queries import GENERAL_RESULTS_QUERY, SEARCH_HOMES_DATA, HOMES_DATA class RealtorScraper(Scraper): @@ -46,155 +47,6 @@ class RealtorScraper(Scraper): return result[0] - def handle_listing(self, listing_id: str) -> list[Property]: - query = """query Listing($listing_id: ID!) { - listing(id: $listing_id) { - source { - id - listing_id - } - address { - line - street_direction - street_number - street_name - street_suffix - unit - city - state_code - postal_code - location { - coordinate { - lat - lon - } - } - } - basic { - sqft - beds - baths_full - baths_half - lot_sqft - sold_price - type - price - status - sold_date - list_date - } - details { - year_built - stories - garage - permalink - } - media { - photos { - href - } - } - } - }""" - - variables = {"listing_id": listing_id} - payload = { - "query": query, - "variables": variables, - } - - response = self.session.post(self.SEARCH_GQL_URL, json=payload) - response_json = response.json() - - property_info = response_json["data"]["listing"] - - mls = ( - property_info["source"].get("id") - if "source" in property_info and isinstance(property_info["source"], dict) - else None - ) - - able_to_get_lat_long = ( - property_info - and property_info.get("address") - and property_info["address"].get("location") - and property_info["address"]["location"].get("coordinate") - ) - list_date_str = ( - property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None - ) - last_sold_date_str = ( - property_info["basic"]["sold_date"].split("T")[0] if property_info["basic"].get("sold_date") else None - ) - pending_date_str = property_info["pending_date"].split("T")[0] if property_info.get("pending_date") else None - - list_date = datetime.strptime(list_date_str, "%Y-%m-%d") if list_date_str else None - last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None - pending_date = datetime.strptime(pending_date_str, "%Y-%m-%d") if pending_date_str else None - today = datetime.now() - - days_on_mls = None - status = property_info["basic"]["status"].lower() - if list_date: - if status == "sold" and last_sold_date: - days_on_mls = (last_sold_date - list_date).days - elif status in ("for_sale", "for_rent"): - days_on_mls = (today - list_date).days - if days_on_mls and days_on_mls < 0: - days_on_mls = None - - property_id = property_info["details"]["permalink"] - prop_details = self.get_prop_details(property_id) - style = property_info["basic"].get("type", "").upper() - listing = Property( - mls=mls, - mls_id=( - property_info["source"].get("listing_id") - if "source" in property_info and isinstance(property_info["source"], dict) - else None - ), - property_url=f"{self.PROPERTY_URL}{property_id}", - status=property_info["basic"]["status"].upper(), - list_price=property_info["basic"]["price"], - list_date=list_date, - prc_sqft=( - property_info["basic"].get("price") / property_info["basic"].get("sqft") - if property_info["basic"].get("price") and property_info["basic"].get("sqft") - else None - ), - last_sold_date=last_sold_date, - pending_date=pending_date, - latitude=property_info["address"]["location"]["coordinate"].get("lat") if able_to_get_lat_long else None, - longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None, - address=self._parse_address(property_info, search_type="handle_listing"), - description=Description( - alt_photos=( - self.process_alt_photos(property_info["media"].get("photos", [])) - if property_info.get("media") - else None - ), - style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None, - beds=property_info["basic"].get("beds"), - baths_full=property_info["basic"].get("baths_full"), - baths_half=property_info["basic"].get("baths_half"), - sqft=property_info["basic"].get("sqft"), - lot_sqft=property_info["basic"].get("lot_sqft"), - sold_price=property_info["basic"].get("sold_price"), - year_built=property_info["details"].get("year_built"), - garage=property_info["details"].get("garage"), - stories=property_info["details"].get("stories"), - text=property_info.get("description", {}).get("text"), - ), - days_on_mls=days_on_mls, - agents=prop_details.get("agents"), - brokers=prop_details.get("brokers"), - nearby_schools=prop_details.get("schools"), - assessed_value=prop_details.get("assessed_value"), - estimated_value=prop_details.get("estimated_value"), - ) - - return [listing] - def get_latest_listing_id(self, property_id: str) -> str | None: query = """query Property($property_id: ID!) { property(id: $property_id) { @@ -228,65 +80,12 @@ class RealtorScraper(Scraper): else: return property_info["listings"][0]["listing_id"] - def handle_address(self, property_id: str) -> list[Property]: - """ - Handles a specific address & returns one property - """ - query = """query Property($property_id: ID!) { - property(id: $property_id) { - property_id - details { - date_updated - garage - permalink - year_built - stories - } - address { - line - street_direction - street_number - street_name - street_suffix - unit - city - state_code - postal_code - location { - coordinate { - lat - lon - } - } - } - basic { - baths - beds - price - sqft - lot_sqft - type - sold_price - } - public_record { - lot_size - sqft - stories - units - year_built - } - primary_photo { - href - } - photos { - href - } - } - }""" + def handle_home(self, property_id: str) -> list[Property]: + query = """query Home($property_id: ID!) { + home(property_id: $property_id) %s + }""" % HOMES_DATA variables = {"property_id": property_id} - prop_details = self.get_prop_details(property_id) - payload = { "query": query, "variables": variables, @@ -295,103 +94,123 @@ class RealtorScraper(Scraper): response = self.session.post(self.SEARCH_GQL_URL, json=payload) response_json = response.json() - property_info = response_json["data"]["property"] + property_info = response_json["data"]["home"] return [ - Property( - mls_id=property_id, - property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", - address=self._parse_address(property_info, search_type="handle_address"), - description=self._parse_description(property_info), - agents=prop_details.get("agents"), - brokers=prop_details.get("brokers"), - nearby_schools=prop_details.get("schools"), - assessed_value=prop_details.get("assessed_value"), - estimated_value=prop_details.get("estimated_value"), - ) + self.process_property(property_info, "home") ] + @staticmethod + def process_advertisers(advertisers: list[dict] | None) -> Advertisers | None: + if not advertisers: + return None + + def _parse_fulfillment_id(fulfillment_id: str | None) -> str | None: + return fulfillment_id if fulfillment_id and fulfillment_id != "0" else None + + processed_advertisers = Advertisers() + + for advertiser in advertisers: + advertiser_type = advertiser.get("type") + if advertiser_type == "seller": #: agent + processed_advertisers.agent = Agent( + uuid=advertiser.get("mls_set"), + name=advertiser.get("name"), + email=advertiser.get("email"), + phones=advertiser.get("phones"), + ) + + if advertiser.get('broker') and advertiser["broker"].get('name'): #: has a broker + processed_advertisers.broker = Broker( + uuid=_parse_fulfillment_id(advertiser["broker"].get("fulfillment_id")), + name=advertiser["broker"].get("name"), + ) + + if advertiser.get("office"): #: has an office + processed_advertisers.office = Office( + uuid=_parse_fulfillment_id(advertiser["office"].get("fulfillment_id")) or advertiser["office"].get("mls_set"), + name=advertiser["office"].get("name"), + email=advertiser["office"].get("email"), + phones=advertiser["office"].get("phones"), + ) + + if advertiser_type == "community": #: could be builder + if advertiser.get("builder"): + processed_advertisers.builder = Builder( + uuid=_parse_fulfillment_id(advertiser["builder"].get("fulfillment_id")), + name=advertiser["builder"].get("name"), + ) + + return processed_advertisers + + def process_property(self, result: dict, query_name: str) -> Property | None: + mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None + + if not mls and self.mls_only: + return + + able_to_get_lat_long = ( + result + and result.get("location") + and result["location"].get("address") + and result["location"]["address"].get("coordinate") + ) + + is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") + + if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING): + return + + property_id = result["property_id"] + prop_details = self.get_prop_details(property_id) if self.extra_property_data and query_name != "home" else {} + if not prop_details: + prop_details = self.process_extra_property_details(result) + + property_estimates_root = result.get("current_estimates") or result.get("estimates", {}).get("currentValues") + estimated_value = self.get_key(property_estimates_root, [0, "estimate"]) + + advertisers = self.process_advertisers(result.get("advertisers")) + + realty_property = Property( + mls=mls, + mls_id=( + result["source"].get("listing_id") + if "source" in result and isinstance(result["source"], dict) + else None + ), + property_url=( + f"{self.PROPERTY_URL}{property_id}" + if self.listing_type != ListingType.FOR_RENT + else f"{self.PROPERTY_URL}M{property_id}?listing_status=rental" + ), + status="PENDING" if is_pending else result["status"].upper(), + list_price=result["list_price"], + list_price_min=result["list_price_min"], + list_price_max=result["list_price_max"], + list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, + prc_sqft=result.get("price_per_sqft"), + last_sold_date=result.get("last_sold_date"), + new_construction=result["flags"].get("is_new_construction") is True, + hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None, + latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None, + longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, + address=self._parse_address(result, search_type="general_search"), + description=self._parse_description(result), + neighborhoods=self._parse_neighborhoods(result), + county=result["location"]["county"].get("name") if result["location"]["county"] else None, + fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, + days_on_mls=self.calculate_days_on_mls(result), + nearby_schools=prop_details.get("schools"), + assessed_value=prop_details.get("assessed_value"), + estimated_value=estimated_value if estimated_value else None, + advertisers=advertisers, + ) + return realty_property + def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]: """ Handles a location area & returns a list of properties """ - results_query = """{ - count - total - results { - pending_date - property_id - list_date - status - last_sold_price - last_sold_date - list_price - list_price_max - list_price_min - price_per_sqft - flags { - is_contingent - is_pending - } - description { - type - sqft - beds - baths_full - baths_half - lot_sqft - sold_price - year_built - garage - sold_price - type - name - stories - text - } - source { - id - listing_id - } - hoa { - fee - } - location { - address { - street_direction - street_number - street_name - street_suffix - line - unit - city - state_code - postal_code - coordinate { - lon - lat - } - } - county { - name - fips_code - } - neighborhoods { - name - } - } - tax_record { - public_record_id - } - primary_photo { - href - } - photos { - href - } - } - } - }""" date_param = "" if self.listing_type == ListingType.SOLD: @@ -443,13 +262,14 @@ class RealtorScraper(Scraper): %s limit: 200 offset: $offset - ) %s""" % ( + ) %s + }""" % ( is_foreclosure, listing_type.value.lower(), date_param, pending_or_contingent_param, sort_param, - results_query, + GENERAL_RESULTS_QUERY, ) elif search_type == "area": #: general search, came from a general location query = """query Home_search( @@ -473,13 +293,14 @@ class RealtorScraper(Scraper): %s limit: 200 offset: $offset - ) %s""" % ( + ) %s + }""" % ( is_foreclosure, listing_type.value.lower(), date_param, pending_or_contingent_param, sort_param, - results_query, + GENERAL_RESULTS_QUERY, ) else: #: general search, came from an address query = ( @@ -487,14 +308,15 @@ class RealtorScraper(Scraper): $property_id: [ID]! $offset: Int!, ) { - property_search( + home_search( query: { property_id: $property_id } limit: 1 offset: $offset - ) %s""" - % results_query + ) %s + }""" + % GENERAL_RESULTS_QUERY ) payload = { @@ -518,63 +340,6 @@ class RealtorScraper(Scraper): ): return {"total": 0, "properties": []} - def process_property(result: dict) -> Property | None: - mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None - - if not mls and self.mls_only: - return - - able_to_get_lat_long = ( - result - and result.get("location") - and result["location"].get("address") - and result["location"]["address"].get("coordinate") - ) - - is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") - - if is_pending and (self.exclude_pending and self.listing_type != ListingType.PENDING): - return - - property_id = result["property_id"] - prop_details = self.get_prop_details(property_id) if self.extra_property_data else {} - - realty_property = Property( - mls=mls, - mls_id=( - result["source"].get("listing_id") - if "source" in result and isinstance(result["source"], dict) - else None - ), - property_url=( - f"{self.PROPERTY_URL}{property_id}" - if self.listing_type != ListingType.FOR_RENT - else f"{self.PROPERTY_URL}M{property_id}?listing_status=rental" - ), - status="PENDING" if is_pending else result["status"].upper(), - list_price=result["list_price"], - list_price_min=result["list_price_min"], - list_price_max=result["list_price_max"], - list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, - prc_sqft=result.get("price_per_sqft"), - last_sold_date=result.get("last_sold_date"), - hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None, - latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None, - longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, - address=self._parse_address(result, search_type="general_search"), - description=self._parse_description(result), - neighborhoods=self._parse_neighborhoods(result), - county=result["location"]["county"].get("name") if result["location"]["county"] else None, - fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, - days_on_mls=self.calculate_days_on_mls(result), - agents=prop_details.get("agents"), - brokers=prop_details.get("brokers"), - nearby_schools=prop_details.get("schools"), - assessed_value=prop_details.get("assessed_value"), - estimated_value=prop_details.get("estimated_value"), - ) - return realty_property - properties_list = response_json["data"][search_key]["results"] total_properties = response_json["data"][search_key]["total"] offset = variables.get("offset", 0) @@ -585,7 +350,7 @@ class RealtorScraper(Scraper): with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor: futures = [ - executor.submit(process_property, result) for result in properties_list + executor.submit(self.process_property, result, search_key) for result in properties_list ] for future in as_completed(futures): @@ -617,17 +382,7 @@ class RealtorScraper(Scraper): if location_type == "address": if not self.radius: #: single address search, non comps property_id = location_info["mpr_id"] - search_variables |= {"property_id": property_id} - - gql_results = self.general_search(search_variables, search_type=search_type) - if gql_results["total"] == 0: - listing_id = self.get_latest_listing_id(property_id) - if listing_id is None: - return self.handle_address(property_id) - else: - return self.handle_listing(listing_id) - else: - return gql_results["properties"] + return self.handle_home(property_id) else: #: general search, comps (radius) if not location_info.get("centroid"): @@ -674,87 +429,49 @@ class RealtorScraper(Scraper): return homes + @staticmethod + def get_key(data: dict, keys: list): + try: + value = data + for key in keys: + value = value[key] + + return value or {} + except (KeyError, TypeError, IndexError): + return {} + + def process_extra_property_details(self, result: dict) -> dict: + schools = self.get_key(result, ["nearbySchools", "schools"]) + assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"]) + + schools = [school["district"]["name"] for school in schools if school["district"].get("name")] + return { + "schools": schools if schools else None, + "assessed_value": assessed_value if assessed_value else None, + } + def get_prop_details(self, property_id: str) -> dict: if not self.extra_property_data: return {} - #: TODO: migrate "advertisers" and "estimates" to general query - query = """query GetHome($property_id: ID!) { home(property_id: $property_id) { __typename - advertisers { - __typename - type - name - email - phones { number type ext primary } - } - - consumer_advertisers { - name - phone - href - type - } - nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { __typename schools { district { __typename id name } } } taxHistory: tax_history { __typename tax year assessment { __typename building land total } } - estimates { - __typename - currentValues: current_values { - __typename - source { __typename type name } - estimate - estimateHigh: estimate_high - estimateLow: estimate_low - date - isBestHomeValue: isbest_homevalue - } - } } }""" variables = {"property_id": property_id} response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables}) + data = response.json() + property_details = data["data"]["home"] - def get_key(keys: list): - try: - value = data - for key in keys: - value = value[key] - - return value or {} - except (KeyError, TypeError, IndexError): - return {} - - agents = get_key(["data", "home", "advertisers"]) - advertisers = get_key(["data", "home", "consumer_advertisers"]) - - schools = get_key(["data", "home", "nearbySchools", "schools"]) - assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"]) - estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"]) - - agents = [Agent(name=ad["name"], email=ad["email"], phones=ad["phones"]) for ad in agents] - - brokers = [ - Broker(name=ad["name"], phone=ad["phone"], website=ad["href"]) - for ad in advertisers - if ad.get("type") != "Agent" - ] - - schools = [school["district"]["name"] for school in schools if school["district"].get("name")] - return { - "agents": agents if agents else None, - "brokers": brokers if brokers else None, - "schools": schools if schools else None, - "assessed_value": assessed_value if assessed_value else None, - "estimated_value": estimated_value if estimated_value else None, - } + return self.process_extra_property_details(property_details) @staticmethod def _parse_neighborhoods(result: dict) -> Optional[str]: @@ -829,7 +546,7 @@ class RealtorScraper(Scraper): sqft=description_data.get("sqft"), lot_sqft=description_data.get("lot_sqft"), sold_price=( - description_data.get("sold_price") + result.get('last_sold_price') or description_data.get("sold_price") if result.get("last_sold_date") or result["list_price"] != description_data.get("sold_price") else None ), #: has a sold date or list and sold price are different @@ -859,14 +576,8 @@ class RealtorScraper(Scraper): return days @staticmethod - def process_alt_photos(photos_info): - try: - alt_photos = [] - if photos_info: - for photo_info in photos_info: - href = photo_info.get("href", "") - alt_photo_href = href.replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") - alt_photos.append(alt_photo_href) - return alt_photos - except Exception: - pass + def process_alt_photos(photos_info: list[dict]) -> list[str] | None: + if not photos_info: + return None + + return [photo_info["href"].replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75") for photo_info in photos_info if photo_info.get("href")] diff --git a/homeharvest/core/scrapers/realtor/queries.py b/homeharvest/core/scrapers/realtor/queries.py new file mode 100644 index 0000000..0ecade8 --- /dev/null +++ b/homeharvest/core/scrapers/realtor/queries.py @@ -0,0 +1,161 @@ +_SEARCH_HOMES_DATA_BASE = """{ + pending_date + listing_id + property_id + list_date + status + last_sold_price + last_sold_date + list_price + list_price_max + list_price_min + price_per_sqft + flags { + is_contingent + is_pending + is_new_construction + } + description { + type + sqft + beds + baths_full + baths_half + lot_sqft + year_built + garage + type + name + stories + text + } + source { + id + listing_id + } + hoa { + fee + } + location { + address { + street_direction + street_number + street_name + street_suffix + line + unit + city + state_code + postal_code + coordinate { + lon + lat + } + } + county { + name + fips_code + } + neighborhoods { + name + } + } + tax_record { + public_record_id + } + primary_photo { + href + } + photos { + href + } + advertisers { + email + broker { + name + fulfillment_id + } + type + name + fulfillment_id + builder { + name + fulfillment_id + } + phones { + ext + primary + type + number + } + office { + name + email + fulfillment_id + href + phones { + number + type + primary + ext + } + mls_set + } + corporation { + specialties + name + bio + href + fulfillment_id + } + mls_set + rental_corporation { + fulfillment_id + } + rental_management { + name + fulfillment_id + } + } + """ + +HOMES_DATA = """%s + nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) { + __typename schools { district { __typename id name } } + } + taxHistory: tax_history { __typename tax year assessment { __typename building land total } } + estimates { + __typename + currentValues: current_values { + __typename + source { __typename type name } + estimate + estimateHigh: estimate_high + estimateLow: estimate_low + date + isBestHomeValue: isbest_homevalue + } + } +}""" % _SEARCH_HOMES_DATA_BASE + +SEARCH_HOMES_DATA = """%s + current_estimates { + __typename + source { + __typename + type + name + } + estimate + estimateHigh: estimate_high + estimateLow: estimate_low + date + isBestHomeValue: isbest_homevalue + } +}""" % _SEARCH_HOMES_DATA_BASE + +GENERAL_RESULTS_QUERY = """{ + count + total + results %s + }""" % SEARCH_HOMES_DATA diff --git a/homeharvest/utils.py b/homeharvest/utils.py index a84d399..fb7fa89 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations import pandas as pd from datetime import datetime -from .core.scrapers.models import Property, ListingType, Agent +from .core.scrapers.models import Property, ListingType, Advertisers from .exceptions import InvalidListingType, InvalidDate ordered_properties = [ @@ -31,6 +31,7 @@ ordered_properties = [ "last_sold_date", "assessed_value", "estimated_value", + "new_construction", "lot_sqft", "price_per_sqft", "latitude", @@ -41,12 +42,18 @@ ordered_properties = [ "stories", "hoa_fee", "parking_garage", - "agent", + "agent_id", + "agent_name", "agent_email", "agent_phones", - "broker", - "broker_phone", - "broker_website", + "broker_id", + "broker_name", + "builder_id", + "builder_name", + "office_id", + "office_name", + "office_email", + "office_phones", "nearby_schools", "primary_photo", "alt_photos", @@ -66,19 +73,31 @@ def process_result(result: Property) -> pd.DataFrame: prop_data["state"] = address_data.state prop_data["zip_code"] = address_data.zip - if "agents" in prop_data: - agents: list[Agent] | None = prop_data["agents"] - if agents: - prop_data["agent"] = agents[0].name - prop_data["agent_email"] = agents[0].email - prop_data["agent_phones"] = agents[0].phones + if "advertisers" in prop_data and prop_data.get("advertisers"): + advertiser_data: Advertisers | None = prop_data["advertisers"] + if advertiser_data.agent: + agent_data = advertiser_data.agent + prop_data["agent_id"] = agent_data.uuid + prop_data["agent_name"] = agent_data.name + prop_data["agent_email"] = agent_data.email + prop_data["agent_phones"] = agent_data.phones - if "brokers" in prop_data: - brokers = prop_data["brokers"] - if brokers: - prop_data["broker"] = brokers[0].name - prop_data["broker_phone"] = brokers[0].phone - prop_data["broker_website"] = brokers[0].website + if advertiser_data.broker: + broker_data = advertiser_data.broker + prop_data["broker_id"] = broker_data.uuid + prop_data["broker_name"] = broker_data.name + + if advertiser_data.builder: + builder_data = advertiser_data.builder + prop_data["builder_id"] = builder_data.uuid + prop_data["builder_name"] = builder_data.name + + if advertiser_data.office: + office_data = advertiser_data.office + prop_data["office_id"] = office_data.uuid + prop_data["office_name"] = office_data.name + prop_data["office_email"] = office_data.email + prop_data["office_phones"] = office_data.phones prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None diff --git a/pyproject.toml b/pyproject.toml index a5589d9..940bebc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.3.34" +version = "0.4.0" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 7387a77..cfac518 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -141,18 +141,23 @@ def test_realtor_foreclosed(): def test_realtor_agent(): - scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000) - assert scraped["agent"].nunique() > 1 + scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000, extra_property_data=False) + assert scraped["agent_name"].nunique() > 1 def test_realtor_without_extra_details(): results = [ scrape_property( - location="15509 N 172nd Dr, Surprise, AZ 85388", + location="00741", + listing_type="sold", + limit=10, extra_property_data=False, ), scrape_property( - location="15509 N 172nd Dr, Surprise, AZ 85388", + location="00741", + listing_type="sold", + limit=10, + extra_property_data=True, ), ]