diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 0373e4b..2b60e3b 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -57,6 +57,10 @@ def _get_ordered_properties(result: Property) -> list[str]: "stories", "year_built", "agent_name", + "agent_phone", + "agent_email", + "days_on_market", + "sold_date", "mls_id", "img_src", "latitude", @@ -84,6 +88,18 @@ def _process_result(result: Property) -> pd.DataFrame: del prop_data["address"] + if "agent" in prop_data and prop_data["agent"] is not None: + agent_data = prop_data["agent"] + prop_data["agent_name"] = agent_data.name + prop_data["agent_phone"] = agent_data.phone + prop_data["agent_email"] = agent_data.email + + del prop_data["agent"] + else: + prop_data["agent_name"] = None + prop_data["agent_phone"] = None + prop_data["agent_email"] = None + properties_df = pd.DataFrame([prop_data]) properties_df = properties_df[_get_ordered_properties(result)] diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index cd79e6b..ed75999 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from enum import Enum from typing import Tuple +from datetime import datetime class SiteName(Enum): @@ -64,6 +65,13 @@ class Address: zip_code: str | None = None +@dataclass +class Agent: + name: str | None = None #: scrapers pass None when the listing carries no agent name + phone: str | None = None + email: str | None = None + + @dataclass class Property: property_url: str @@ -81,11 +89,11 @@ class Property: price_per_sqft: int | None = None mls_id: str | None = None - agent_name: str | None = None + agent: Agent | None = None img_src: str | None = None description: str | None = None status_text: str | None = None - posted_time: str | None = None + posted_time: datetime | None = None # building for sale bldg_name: str | None = None @@ -107,3 +115,6 @@ class Property: latitude: float | None = None longitude: float | None = None + + sold_date: datetime | None = 
None + days_on_market: int | None = None diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index 62942e0..80b91f8 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -8,8 +8,9 @@ import json from typing import Any from .. import Scraper from ....utils import parse_address_two, parse_address_one -from ..models import Property, Address, PropertyType, ListingType, SiteName -from ....exceptions import NoResultsFound +from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent +from ....exceptions import NoResultsFound, SearchTooBroad +from datetime import datetime class RedfinScraper(Scraper): @@ -30,6 +31,8 @@ class RedfinScraper(Scraper): return "6" #: city elif match_type == "1": return "address" #: address, needs to be handled differently + elif match_type == "11": + return "state" if "exactMatch" not in response_json["payload"]: raise NoResultsFound("No results found for location: {}".format(self.location)) @@ -74,6 +77,8 @@ class RedfinScraper(Scraper): else: lot_size = lot_size_data + lat_long = get_value("latLong") + return Property( site_name=self.site_name, listing_type=self.listing_type, @@ -88,15 +93,20 @@ class RedfinScraper(Scraper): sqft_min=get_value("sqFt"), sqft_max=get_value("sqFt"), stories=home["stories"] if "stories" in home else None, - agent_name=get_value("listingAgent"), + agent=Agent( #: listingAgent, some have sellingAgent as well + name=home['listingAgent'].get('name'), + phone=home['listingAgent'].get('phone'), + ) if 'listingAgent' in home else None, description=home["listingRemarks"] if "listingRemarks" in home else None, year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"), lot_area_value=lot_size, property_type=PropertyType.from_int_code(home.get("propertyType")), price_per_sqft=get_value("pricePerSqFt") if 
type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"), mls_id=get_value("mlsId"), - latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None, - longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None, + latitude=lat_long.get('latitude') if lat_long else None, + longitude=lat_long.get('longitude') if lat_long else None, + sold_date=datetime.fromtimestamp(home['soldDate'] / 1000) if 'soldDate' in home else None, + days_on_market=get_value("dom") ) def _handle_rentals(self, region_id, region_type): @@ -207,6 +217,9 @@ class RedfinScraper(Scraper): def search(self): region_id, region_type = self._handle_location() + if region_type == "state": + raise SearchTooBroad("State searches are not supported, please use a more specific location.") + if region_type == "address": home_id = region_id return self.handle_address(home_id) diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 067adb8..5911e98 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -9,7 +9,7 @@ import json from .. 
import Scraper from ....utils import parse_address_one, parse_address_two from ....exceptions import GeoCoordsNotFound, NoResultsFound -from ..models import Property, Address, ListingType, PropertyType +from ..models import Property, Address, ListingType, PropertyType, Agent class ZillowScraper(Scraper): @@ -165,10 +165,10 @@ class ZillowScraper(Scraper): home_info["statusType"] if "statusType" in home_info else self.listing_type ), status_text=result.get("statusText"), - posted_time=result["variableData"]["text"] + posted_time=result["variableData"]["text"] #: TODO: change to datetime if "variableData" in result - and "text" in result["variableData"] - and result["variableData"]["type"] == "TIME_ON_INFO" + and "text" in result["variableData"] + and result["variableData"]["type"] == "TIME_ON_INFO" else None, price_min=home_info.get("price"), price_max=home_info.get("price"), @@ -246,7 +246,9 @@ class ZillowScraper(Scraper): tax_assessed_value=property_data.get("taxAssessedValue"), lot_area_value=property_data.get("lotAreaValue"), lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None, - agent_name=property_data.get("attributionInfo", {}).get("agentName"), + agent=Agent( + name=property_data.get("attributionInfo", {}).get("agentName") + ), stories=property_data.get("resoFacts", {}).get("stories"), mls_id=property_data.get("attributionInfo", {}).get("mlsId"), beds_min=property_data.get("bedrooms"), @@ -298,20 +300,21 @@ class ZillowScraper(Scraper): def _get_headers(self): headers = { - "authority": "www.zillow.com", - "accept": "*/*", - "accept-language": "en-US,en;q=0.9", - "content-type": "application/json", - "origin": "https://www.zillow.com", - "referer": "https://www.zillow.com", - "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', - "sec-ch-ua-mobile": "?0", - "sec-ch-ua-platform": '"Windows"', - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", - 
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", + 'authority': 'www.zillow.com', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'accept-language': 'en-US,en;q=0.9', + 'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'document', + 'sec-fetch-mode': 'navigate', + 'sec-fetch-site': 'none', + 'sec-fetch-user': '?1', + 'upgrade-insecure-requests': '1', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', } + if self.cookies: headers['Cookie'] = self.cookies + return headers diff --git a/homeharvest/exceptions.py b/homeharvest/exceptions.py index cd18640..95eedbc 100644 --- a/homeharvest/exceptions.py +++ b/homeharvest/exceptions.py @@ -12,3 +12,7 @@ class NoResultsFound(Exception): class GeoCoordsNotFound(Exception): """Raised when no property is found for the given address""" + + +class SearchTooBroad(Exception): + """Raised when the search is too broad""" diff --git a/tests/test_redfin.py b/tests/test_redfin.py index b55b442..6904499 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -4,11 +4,13 @@ from homeharvest.exceptions import ( InvalidListingType, NoResultsFound, GeoCoordsNotFound, + SearchTooBroad, ) def test_redfin(): results = [ + scrape_property(location="San Diego", site_name="redfin", listing_type="for_sale"), scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"), scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"), scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"), @@ -24,9 +26,10 @@ def test_redfin(): location="abceefg ju098ot498hh9", 
site_name="redfin", listing_type="for_sale", - ) + ), + scrape_property(location="Florida", site_name="redfin", listing_type="for_rent"), ] - except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound): + except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad): assert True assert all([result is None for result in bad_results])