parent
9200c17df2
commit
01c53f9399
|
@ -57,6 +57,10 @@ def _get_ordered_properties(result: Property) -> list[str]:
|
||||||
"stories",
|
"stories",
|
||||||
"year_built",
|
"year_built",
|
||||||
"agent_name",
|
"agent_name",
|
||||||
|
"agent_phone",
|
||||||
|
"agent_email",
|
||||||
|
"days_on_market",
|
||||||
|
"sold_date",
|
||||||
"mls_id",
|
"mls_id",
|
||||||
"img_src",
|
"img_src",
|
||||||
"latitude",
|
"latitude",
|
||||||
|
@ -84,6 +88,18 @@ def _process_result(result: Property) -> pd.DataFrame:
|
||||||
|
|
||||||
del prop_data["address"]
|
del prop_data["address"]
|
||||||
|
|
||||||
|
if "agent" in prop_data and prop_data["agent"] is not None:
|
||||||
|
agent_data = prop_data["agent"]
|
||||||
|
prop_data["agent_name"] = agent_data.name
|
||||||
|
prop_data["agent_phone"] = agent_data.phone
|
||||||
|
prop_data["agent_email"] = agent_data.email
|
||||||
|
|
||||||
|
del prop_data["agent"]
|
||||||
|
else:
|
||||||
|
prop_data["agent_name"] = None
|
||||||
|
prop_data["agent_phone"] = None
|
||||||
|
prop_data["agent_email"] = None
|
||||||
|
|
||||||
properties_df = pd.DataFrame([prop_data])
|
properties_df = pd.DataFrame([prop_data])
|
||||||
properties_df = properties_df[_get_ordered_properties(result)]
|
properties_df = properties_df[_get_ordered_properties(result)]
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class SiteName(Enum):
|
class SiteName(Enum):
|
||||||
|
@ -64,6 +65,13 @@ class Address:
|
||||||
zip_code: str | None = None
|
zip_code: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Agent:
|
||||||
|
name: str
|
||||||
|
phone: str | None = None
|
||||||
|
email: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Property:
|
class Property:
|
||||||
property_url: str
|
property_url: str
|
||||||
|
@ -81,11 +89,11 @@ class Property:
|
||||||
price_per_sqft: int | None = None
|
price_per_sqft: int | None = None
|
||||||
mls_id: str | None = None
|
mls_id: str | None = None
|
||||||
|
|
||||||
agent_name: str | None = None
|
agent: Agent | None = None
|
||||||
img_src: str | None = None
|
img_src: str | None = None
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
status_text: str | None = None
|
status_text: str | None = None
|
||||||
posted_time: str | None = None
|
posted_time: datetime | None = None
|
||||||
|
|
||||||
# building for sale
|
# building for sale
|
||||||
bldg_name: str | None = None
|
bldg_name: str | None = None
|
||||||
|
@ -107,3 +115,6 @@ class Property:
|
||||||
|
|
||||||
latitude: float | None = None
|
latitude: float | None = None
|
||||||
longitude: float | None = None
|
longitude: float | None = None
|
||||||
|
|
||||||
|
sold_date: datetime | None = None
|
||||||
|
days_on_market: int | None = None
|
||||||
|
|
|
@ -8,8 +8,9 @@ import json
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_address_one
|
from ....utils import parse_address_two, parse_address_one
|
||||||
from ..models import Property, Address, PropertyType, ListingType, SiteName
|
from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent
|
||||||
from ....exceptions import NoResultsFound
|
from ....exceptions import NoResultsFound, SearchTooBroad
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class RedfinScraper(Scraper):
|
class RedfinScraper(Scraper):
|
||||||
|
@ -30,6 +31,8 @@ class RedfinScraper(Scraper):
|
||||||
return "6" #: city
|
return "6" #: city
|
||||||
elif match_type == "1":
|
elif match_type == "1":
|
||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
elif match_type == "11":
|
||||||
|
return "state"
|
||||||
|
|
||||||
if "exactMatch" not in response_json["payload"]:
|
if "exactMatch" not in response_json["payload"]:
|
||||||
raise NoResultsFound("No results found for location: {}".format(self.location))
|
raise NoResultsFound("No results found for location: {}".format(self.location))
|
||||||
|
@ -74,6 +77,8 @@ class RedfinScraper(Scraper):
|
||||||
else:
|
else:
|
||||||
lot_size = lot_size_data
|
lot_size = lot_size_data
|
||||||
|
|
||||||
|
lat_long = get_value("latLong")
|
||||||
|
|
||||||
return Property(
|
return Property(
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
listing_type=self.listing_type,
|
listing_type=self.listing_type,
|
||||||
|
@ -88,15 +93,20 @@ class RedfinScraper(Scraper):
|
||||||
sqft_min=get_value("sqFt"),
|
sqft_min=get_value("sqFt"),
|
||||||
sqft_max=get_value("sqFt"),
|
sqft_max=get_value("sqFt"),
|
||||||
stories=home["stories"] if "stories" in home else None,
|
stories=home["stories"] if "stories" in home else None,
|
||||||
agent_name=get_value("listingAgent"),
|
agent=Agent( #: listingAgent, some have sellingAgent as well
|
||||||
|
name=home['listingAgent'].get('name') if 'listingAgent' in home else None,
|
||||||
|
phone=home['listingAgent'].get('phone') if 'listingAgent' in home else None,
|
||||||
|
),
|
||||||
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
description=home["listingRemarks"] if "listingRemarks" in home else None,
|
||||||
year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
|
year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
|
||||||
lot_area_value=lot_size,
|
lot_area_value=lot_size,
|
||||||
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
||||||
price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
latitude=lat_long.get('latitude') if lat_long else None,
|
||||||
longitude=home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None,
|
longitude=lat_long.get('longitude') if lat_long else None,
|
||||||
|
sold_date=datetime.fromtimestamp(home['soldDate'] / 1000) if 'soldDate' in home else None,
|
||||||
|
days_on_market=get_value("dom")
|
||||||
)
|
)
|
||||||
|
|
||||||
def _handle_rentals(self, region_id, region_type):
|
def _handle_rentals(self, region_id, region_type):
|
||||||
|
@ -207,6 +217,9 @@ class RedfinScraper(Scraper):
|
||||||
def search(self):
|
def search(self):
|
||||||
region_id, region_type = self._handle_location()
|
region_id, region_type = self._handle_location()
|
||||||
|
|
||||||
|
if region_type == "state":
|
||||||
|
raise SearchTooBroad("State searches are not supported, please use a more specific location.")
|
||||||
|
|
||||||
if region_type == "address":
|
if region_type == "address":
|
||||||
home_id = region_id
|
home_id = region_id
|
||||||
return self.handle_address(home_id)
|
return self.handle_address(home_id)
|
||||||
|
|
|
@ -9,7 +9,7 @@ import json
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_one, parse_address_two
|
from ....utils import parse_address_one, parse_address_two
|
||||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType
|
from ..models import Property, Address, ListingType, PropertyType, Agent
|
||||||
|
|
||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
|
@ -165,7 +165,7 @@ class ZillowScraper(Scraper):
|
||||||
home_info["statusType"] if "statusType" in home_info else self.listing_type
|
home_info["statusType"] if "statusType" in home_info else self.listing_type
|
||||||
),
|
),
|
||||||
status_text=result.get("statusText"),
|
status_text=result.get("statusText"),
|
||||||
posted_time=result["variableData"]["text"]
|
posted_time=result["variableData"]["text"] #: TODO: change to datetime
|
||||||
if "variableData" in result
|
if "variableData" in result
|
||||||
and "text" in result["variableData"]
|
and "text" in result["variableData"]
|
||||||
and result["variableData"]["type"] == "TIME_ON_INFO"
|
and result["variableData"]["type"] == "TIME_ON_INFO"
|
||||||
|
@ -246,7 +246,9 @@ class ZillowScraper(Scraper):
|
||||||
tax_assessed_value=property_data.get("taxAssessedValue"),
|
tax_assessed_value=property_data.get("taxAssessedValue"),
|
||||||
lot_area_value=property_data.get("lotAreaValue"),
|
lot_area_value=property_data.get("lotAreaValue"),
|
||||||
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
|
lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
|
||||||
agent_name=property_data.get("attributionInfo", {}).get("agentName"),
|
agent=Agent(
|
||||||
|
name=property_data.get("attributionInfo", {}).get("agentName")
|
||||||
|
),
|
||||||
stories=property_data.get("resoFacts", {}).get("stories"),
|
stories=property_data.get("resoFacts", {}).get("stories"),
|
||||||
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
|
mls_id=property_data.get("attributionInfo", {}).get("mlsId"),
|
||||||
beds_min=property_data.get("bedrooms"),
|
beds_min=property_data.get("bedrooms"),
|
||||||
|
@ -298,20 +300,21 @@ class ZillowScraper(Scraper):
|
||||||
|
|
||||||
def _get_headers(self):
|
def _get_headers(self):
|
||||||
headers = {
|
headers = {
|
||||||
"authority": "www.zillow.com",
|
'authority': 'www.zillow.com',
|
||||||
"accept": "*/*",
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||||
"accept-language": "en-US,en;q=0.9",
|
'accept-language': 'en-US,en;q=0.9',
|
||||||
"content-type": "application/json",
|
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
|
||||||
"origin": "https://www.zillow.com",
|
'sec-ch-ua-mobile': '?0',
|
||||||
"referer": "https://www.zillow.com",
|
'sec-ch-ua-platform': '"Windows"',
|
||||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
'sec-fetch-dest': 'document',
|
||||||
"sec-ch-ua-mobile": "?0",
|
'sec-fetch-mode': 'navigate',
|
||||||
"sec-ch-ua-platform": '"Windows"',
|
'sec-fetch-site': 'none',
|
||||||
"sec-fetch-dest": "empty",
|
'sec-fetch-user': '?1',
|
||||||
"sec-fetch-mode": "cors",
|
'upgrade-insecure-requests': '1',
|
||||||
"sec-fetch-site": "same-origin",
|
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.cookies:
|
if self.cookies:
|
||||||
headers['Cookie'] = self.cookies
|
headers['Cookie'] = self.cookies
|
||||||
|
|
||||||
return headers
|
return headers
|
||||||
|
|
|
@ -12,3 +12,7 @@ class NoResultsFound(Exception):
|
||||||
|
|
||||||
class GeoCoordsNotFound(Exception):
|
class GeoCoordsNotFound(Exception):
|
||||||
"""Raised when no property is found for the given address"""
|
"""Raised when no property is found for the given address"""
|
||||||
|
|
||||||
|
|
||||||
|
class SearchTooBroad(Exception):
|
||||||
|
"""Raised when the search is too broad"""
|
||||||
|
|
|
@ -4,11 +4,13 @@ from homeharvest.exceptions import (
|
||||||
InvalidListingType,
|
InvalidListingType,
|
||||||
NoResultsFound,
|
NoResultsFound,
|
||||||
GeoCoordsNotFound,
|
GeoCoordsNotFound,
|
||||||
|
SearchTooBroad,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
results = [
|
results = [
|
||||||
|
scrape_property(location="San Diego", site_name="redfin", listing_type="for_sale"),
|
||||||
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
|
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"),
|
||||||
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
|
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"),
|
||||||
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
|
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"),
|
||||||
|
@ -24,9 +26,10 @@ def test_redfin():
|
||||||
location="abceefg ju098ot498hh9",
|
location="abceefg ju098ot498hh9",
|
||||||
site_name="redfin",
|
site_name="redfin",
|
||||||
listing_type="for_sale",
|
listing_type="for_sale",
|
||||||
)
|
),
|
||||||
|
scrape_property(location="Florida", site_name="redfin", listing_type="for_rent"),
|
||||||
]
|
]
|
||||||
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):
|
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad):
|
||||||
assert True
|
assert True
|
||||||
|
|
||||||
assert all([result is None for result in bad_results])
|
assert all([result is None for result in bad_results])
|
||||||
|
|
Loading…
Reference in New Issue