mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-04 19:44:29 -08:00
feat: add pandas
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
from .core.scrapers.redfin import RedfinScraper
|
||||
from .core.scrapers.realtor import RealtorScraper
|
||||
from .core.scrapers.zillow import ZillowScraper
|
||||
from .core.scrapers.models import ListingType, Property, Building
|
||||
from .core.scrapers.models import ListingType, Property, Building, SiteName
|
||||
from .core.scrapers import ScraperInput
|
||||
from .exceptions import InvalidSite, InvalidListingType
|
||||
from typing import Union
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_scrapers = {
|
||||
@@ -18,7 +19,7 @@ def scrape_property(
|
||||
location: str,
|
||||
site_name: str,
|
||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
|
||||
) -> Union[list[Building], list[Property]]:
|
||||
if site_name.lower() not in _scrapers:
|
||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||
|
||||
@@ -30,8 +31,69 @@ def scrape_property(
|
||||
scraper_input = ScraperInput(
|
||||
location=location,
|
||||
listing_type=ListingType[listing_type.upper()],
|
||||
site_name=SiteName[site_name.upper()],
|
||||
)
|
||||
|
||||
site = _scrapers[site_name.lower()](scraper_input)
|
||||
results = site.search()
|
||||
|
||||
return site.search()
|
||||
properties_dfs = []
|
||||
|
||||
for result in results:
|
||||
prop_data = result.__dict__
|
||||
|
||||
address_data = prop_data["address"]
|
||||
prop_data["site_name"] = prop_data["site_name"].value
|
||||
prop_data["listing_type"] = prop_data["listing_type"].value
|
||||
prop_data["property_type"] = prop_data["property_type"].value.lower()
|
||||
prop_data["address_one"] = address_data.address_one
|
||||
prop_data["city"] = address_data.city
|
||||
prop_data["state"] = address_data.state
|
||||
prop_data["zip_code"] = address_data.zip_code
|
||||
prop_data["address_two"] = address_data.address_two
|
||||
|
||||
del prop_data["address"]
|
||||
|
||||
if isinstance(result, Property):
|
||||
desired_order = [
|
||||
"listing_type",
|
||||
"address_one",
|
||||
"city",
|
||||
"state",
|
||||
"zip_code",
|
||||
"address_two",
|
||||
"url",
|
||||
"property_type",
|
||||
"price",
|
||||
"beds",
|
||||
"baths",
|
||||
"square_feet",
|
||||
"price_per_square_foot",
|
||||
"lot_size",
|
||||
"stories",
|
||||
"year_built",
|
||||
"agent_name",
|
||||
"mls_id",
|
||||
"description",
|
||||
]
|
||||
|
||||
elif isinstance(result, Building):
|
||||
desired_order = [
|
||||
"address_one",
|
||||
"city",
|
||||
"state",
|
||||
"zip_code",
|
||||
"address_two",
|
||||
"url",
|
||||
"num_units",
|
||||
"min_unit_price",
|
||||
"max_unit_price",
|
||||
"avg_unit_price",
|
||||
"listing_type",
|
||||
]
|
||||
|
||||
properties_df = pd.DataFrame([prop_data])
|
||||
properties_df = properties_df[desired_order]
|
||||
properties_dfs.append(properties_df)
|
||||
|
||||
return pd.concat(properties_dfs, ignore_index=True)
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
from dataclasses import dataclass
|
||||
import requests
|
||||
from .models import Property, ListingType
|
||||
from .models import Property, ListingType, SiteName
|
||||
|
||||
|
||||
@dataclass
class ScraperInput:
    """Parameters describing one search request handed to a scraper."""

    # Location to search for.
    location: str
    # Which kind of listing to search.
    listing_type: ListingType
    # Which site the request is for.
    site_name: SiteName
    # Optional proxy URL; defaults to None.
    proxy_url: str | None = None
|
||||
|
||||
|
||||
@@ -14,6 +15,8 @@ class Scraper:
|
||||
def __init__(self, scraper_input: ScraperInput):
|
||||
self.location = scraper_input.location
|
||||
self.session = requests.Session()
|
||||
self.listing_type = scraper_input.listing_type
|
||||
self.site_name = scraper_input.site_name
|
||||
|
||||
if scraper_input.proxy_url:
|
||||
self.session.proxies = {
|
||||
|
||||
@@ -2,12 +2,43 @@ from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SiteName(Enum):
    """Listing sites that can be scraped.

    Members can be looked up by name (``SiteName["ZILLOW"]``); ``.value``
    holds the site's label.
    """

    ZILLOW = "zillow"
    REDFIN = "redfin"
    # The label carries the ".com" suffix, so it differs from the lower-cased
    # member name.
    REALTOR = "realtor.com"
|
||||
|
||||
|
||||
class ListingType(Enum):
    """Kind of listing: for sale, for rent, or sold.

    Each value is the lower-case spelling of its member name.
    """

    FOR_SALE = "for_sale"
    FOR_RENT = "for_rent"
    SOLD = "sold"
|
||||
|
||||
|
||||
class PropertyType(Enum):
    """Category of a listed property.

    Every value is the upper-case spelling of its member name, so a lookup by
    value such as ``PropertyType("TOWNHOUSE")`` works uniformly for all
    members.
    """

    HOUSE = "HOUSE"
    CONDO = "CONDO"
    # Was "townhousE", which made PropertyType("TOWNHOUSE") raise ValueError.
    TOWNHOUSE = "TOWNHOUSE"
    SINGLE_FAMILY = "SINGLE_FAMILY"
    MULTI_FAMILY = "MULTI_FAMILY"
    LAND = "LAND"
    OTHER = "OTHER"

    @classmethod
    def from_int_code(cls, code):
        """Translate an integer property-type code into a member.

        Args:
            code: Integer code to translate. Codes 8 and 13 both map to
                ``SINGLE_FAMILY``.

        Returns:
            The matching member, or ``OTHER`` when the code is not in the
            table (this includes ``None``).
        """
        mapping = {
            1: cls.HOUSE,
            2: cls.CONDO,
            3: cls.TOWNHOUSE,
            4: cls.MULTI_FAMILY,
            5: cls.LAND,
            6: cls.OTHER,
            8: cls.SINGLE_FAMILY,
            13: cls.SINGLE_FAMILY,
        }

        return mapping.get(code, cls.OTHER)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Address:
|
||||
address_one: str
|
||||
@@ -18,35 +49,35 @@ class Address:
|
||||
address_two: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Property:
|
||||
@dataclass()
class Realty:
    """Fields common to scraped listing records; base class for ``Property`` and ``Building``."""

    # Site the record came from.
    site_name: SiteName
    # Structured address of the listing.
    address: Address
    # Listing URL.
    url: str
    # Optional listing type; defaults to None.
    listing_type: ListingType | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Property(Realty):
|
||||
price: int | None = None
|
||||
beds: int | None = None
|
||||
baths: float | None = None
|
||||
stories: int | None = None
|
||||
agent_name: str | None = None
|
||||
year_built: int | None = None
|
||||
square_feet: int | None = None
|
||||
price_per_square_foot: int | None = None
|
||||
year_built: int | None = None
|
||||
price: int | None = None
|
||||
mls_id: str | None = None
|
||||
|
||||
listing_type: ListingType | None = None
|
||||
agent_name: str | None = None
|
||||
property_type: PropertyType | None = None
|
||||
lot_size: int | None = None
|
||||
description: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Building:
|
||||
address: Address
|
||||
url: str
|
||||
|
||||
class Building(Realty):
|
||||
num_units: int | None = None
|
||||
min_unit_price: int | None = None
|
||||
max_unit_price: int | None = None
|
||||
avg_unit_price: int | None = None
|
||||
|
||||
listing_type: str | None = None
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import json
|
||||
from ..models import Property, Address
|
||||
from ..models import Property, Address, PropertyType
|
||||
from .. import Scraper
|
||||
from typing import Any
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Any
|
||||
class RedfinScraper(Scraper):
|
||||
def __init__(self, scraper_input):
|
||||
super().__init__(scraper_input)
|
||||
self.listing_type = scraper_input.listing_type
|
||||
|
||||
def _handle_location(self):
|
||||
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
|
||||
@@ -31,8 +32,7 @@ class RedfinScraper(Scraper):
|
||||
|
||||
return target["id"].split("_")[1], get_region_type(target["type"])
|
||||
|
||||
@staticmethod
|
||||
def _parse_home(home: dict, single_search: bool = False) -> Property:
|
||||
def _parse_home(self, home: dict, single_search: bool = False) -> Property:
|
||||
def get_value(key: str) -> Any | None:
|
||||
if key in home and "value" in home[key]:
|
||||
return home[key]["value"]
|
||||
@@ -53,10 +53,12 @@ class RedfinScraper(Scraper):
|
||||
state=home["state"],
|
||||
zip_code=home["zip"],
|
||||
)
|
||||
|
||||
url = "https://www.redfin.com{}".format(home["url"])
|
||||
property_type = home["propertyType"] if "propertyType" in home else None
|
||||
|
||||
return Property(
|
||||
site_name=self.site_name,
|
||||
listing_type=self.listing_type,
|
||||
address=address,
|
||||
url=url,
|
||||
beds=home["beds"] if "beds" in home else None,
|
||||
@@ -68,6 +70,8 @@ class RedfinScraper(Scraper):
|
||||
if not single_search
|
||||
else home["yearBuilt"],
|
||||
square_feet=get_value("sqFt"),
|
||||
lot_size=home.get("lotSize", {}).get("value", None),
|
||||
property_type=PropertyType.from_int_code(home.get("propertyType")),
|
||||
price_per_square_foot=get_value("pricePerSqFt"),
|
||||
price=get_value("price"),
|
||||
mls_id=get_value("mlsId"),
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
import re
|
||||
import json
|
||||
from ..models import Property, Address, Building, ListingType
|
||||
from ..models import Property, Address, Building, ListingType, PropertyType
|
||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||
from .. import Scraper
|
||||
|
||||
|
||||
class ZillowScraper(Scraper):
|
||||
listing_type: ListingType.FOR_SALE
|
||||
|
||||
def __init__(self, scraper_input):
|
||||
super().__init__(scraper_input)
|
||||
self.listing_type = scraper_input.listing_type
|
||||
@@ -65,15 +63,17 @@ class ZillowScraper(Scraper):
|
||||
agent_name = self._extract_agent_name(home)
|
||||
beds = home["hdpData"]["homeInfo"]["bedrooms"]
|
||||
baths = home["hdpData"]["homeInfo"]["bathrooms"]
|
||||
listing_type = home["hdpData"]["homeInfo"].get("homeType")
|
||||
property_type = home["hdpData"]["homeInfo"].get("homeType")
|
||||
|
||||
return Property(
|
||||
site_name=self.site_name,
|
||||
address=address,
|
||||
agent_name=agent_name,
|
||||
url=url,
|
||||
beds=beds,
|
||||
baths=baths,
|
||||
listing_type=listing_type,
|
||||
listing_type=self.listing_type,
|
||||
property_type=PropertyType(property_type),
|
||||
**price_data,
|
||||
)
|
||||
else:
|
||||
@@ -83,10 +83,11 @@ class ZillowScraper(Scraper):
|
||||
address = Address(address_one, city, state, zip_code, address_two)
|
||||
|
||||
building_info = self._extract_building_info(home)
|
||||
return Building(address=address, url=url, **building_info)
|
||||
return Building(
|
||||
site_name=self.site_name, address=address, url=url, **building_info
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _get_single_property_page(cls, property_data: dict):
|
||||
def _get_single_property_page(self, property_data: dict):
|
||||
"""
|
||||
This method is used when a user enters the exact location & zillow returns just one property
|
||||
"""
|
||||
@@ -104,8 +105,11 @@ class ZillowScraper(Scraper):
|
||||
state=address_data["state"],
|
||||
zip_code=address_data["zipcode"],
|
||||
)
|
||||
property_type = property_data.get("homeType", None)
|
||||
print(property_type)
|
||||
|
||||
return Property(
|
||||
site_name=self.site_name,
|
||||
address=address,
|
||||
url=url,
|
||||
beds=property_data.get("bedrooms", None),
|
||||
@@ -121,7 +125,8 @@ class ZillowScraper(Scraper):
|
||||
"pricePerSquareFoot", None
|
||||
),
|
||||
square_feet=property_data.get("livingArea", None),
|
||||
listing_type=property_data.get("homeType", None),
|
||||
property_type=PropertyType(property_type),
|
||||
listing_type=self.listing_type,
|
||||
)
|
||||
|
||||
def _extract_building_info(self, home: dict) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user