[enh]: clean data

pull/30/head
Cullen Watson 2023-10-03 21:16:38 -05:00
parent 8388d47f73
commit 1464b4f7d4
13 changed files with 353 additions and 897 deletions

View File

@ -3,139 +3,60 @@ from typing import Union
import concurrent.futures import concurrent.futures
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from .utils import process_result, ordered_properties
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper from .core.scrapers.models import Status, Property, SiteName
from .core.scrapers.models import ListingType, Property, SiteName
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
_scrapers = { _scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper, "realtor.com": RealtorScraper,
"zillow": ZillowScraper,
} }
def _validate_input(site_name: str, listing_type: str) -> None: def _validate_input(site_name: str, status: str) -> None:
if site_name.lower() not in _scrapers: if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.") raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
if listing_type.upper() not in ListingType.__members__: if status.upper() not in Status.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.") raise InvalidListingType(f"Provided listing type, '{status}', does not exist.")
def _get_ordered_properties(result: Property) -> list[str]: def _scrape_single_site(
return [ location: str, site_name: str, status: str, proxy: str = None, timeframe: str = None
"property_url", ) -> pd.DataFrame:
"site_name",
"listing_type",
"property_type",
"status_text",
"baths_min",
"baths_max",
"beds_min",
"beds_max",
"sqft_min",
"sqft_max",
"price_min",
"price_max",
"unit_count",
"tax_assessed_value",
"price_per_sqft",
"lot_area_value",
"lot_area_unit",
"address_one",
"address_two",
"city",
"state",
"zip_code",
"posted_time",
"area_min",
"bldg_name",
"stories",
"year_built",
"agent_name",
"agent_phone",
"agent_email",
"days_on_market",
"sold_date",
"mls_id",
"img_src",
"latitude",
"longitude",
"description",
]
def _process_result(result: Property) -> pd.DataFrame:
prop_data = result.__dict__
prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower()
else:
prop_data["property_type"] = None
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["address_one"] = address_data.address_one
prop_data["address_two"] = address_data.address_two
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code
del prop_data["address"]
if "agent" in prop_data and prop_data["agent"] is not None:
agent_data = prop_data["agent"]
prop_data["agent_name"] = agent_data.name
prop_data["agent_phone"] = agent_data.phone
prop_data["agent_email"] = agent_data.email
del prop_data["agent"]
else:
prop_data["agent_name"] = None
prop_data["agent_phone"] = None
prop_data["agent_email"] = None
properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[_get_ordered_properties(result)]
return properties_df
def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
""" """
Helper function to scrape a single site. Helper function to scrape a single site.
""" """
_validate_input(site_name, listing_type) print(status)
_validate_input(site_name, status)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
listing_type=ListingType[listing_type.upper()], status=status,
site_name=SiteName.get_by_value(site_name.lower()), site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy, proxy=proxy,
timeframe=timeframe,
) )
site = _scrapers[site_name.lower()](scraper_input) site = _scrapers[site_name.lower()](scraper_input)
results = site.search() results = site.search()
print(f"Found {len(results)} results for {site_name}")
properties_dfs = [_process_result(result) for result in results] properties_dfs = [process_result(result) for result in results]
properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
return pd.concat(properties_dfs, ignore_index=True) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
def scrape_property( def scrape_property(
location: str, location: str,
timeframe: str,
site_name: Union[str, list[str]] = None, site_name: Union[str, list[str]] = None,
listing_type: str = "for_sale", status: str = "sale",
proxy: str = None, proxy: str = None,
keep_duplicates: bool = False
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape property from various sites from a given location and listing type. Scrape property from various sites from a given location and listing type.
@ -155,12 +76,14 @@ def scrape_property(
results = [] results = []
if len(site_name) == 1: if len(site_name) == 1:
final_df = _scrape_single_site(location, site_name[0], listing_type, proxy) final_df = _scrape_single_site(location, site_name[0], status, proxy, timeframe)
results.append(final_df) results.append(final_df)
else: else:
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
futures = { futures = {
executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name executor.submit(
_scrape_single_site, location, s_name, status, proxy, timeframe
): s_name
for s_name in site_name for s_name in site_name
} }
@ -175,13 +98,11 @@ def scrape_property(
final_df = pd.concat(results, ignore_index=True) final_df = pd.concat(results, ignore_index=True)
columns_to_track = ["address_one", "address_two", "city"] columns_to_track = ["Street", "Unit", "Zip"]
#: validate they exist, otherwise create them #: validate they exist, otherwise create them
for col in columns_to_track: for col in columns_to_track:
if col not in final_df.columns: if col not in final_df.columns:
final_df[col] = None final_df[col] = None
if not keep_duplicates:
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
return final_df return final_df

View File

@ -5,7 +5,9 @@ from homeharvest import scrape_property
def main(): def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper") parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)") parser.add_argument(
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)
parser.add_argument( parser.add_argument(
"-s", "-s",
@ -46,14 +48,22 @@ def main():
"-k", "-k",
"--keep_duplicates", "--keep_duplicates",
action="store_true", action="store_true",
help="Keep duplicate properties based on address" help="Keep duplicate properties based on address",
) )
parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping") parser.add_argument(
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)
args = parser.parse_args() args = parser.parse_args()
result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates) result = scrape_property(
args.location,
args.site_name,
args.listing_type,
proxy=args.proxy,
keep_duplicates=args.keep_duplicates,
)
if not args.filename: if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

View File

@ -1,21 +1,38 @@
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
import tls_client import tls_client
from .models import Property, ListingType, SiteName from typing import Optional
from .models import Property, SiteName, Status
from ...exceptions import InvalidTimeFrame
VALID_TIMEFRAMES = ["1W", "1M", "3M", "6M", "1Y"]
VALID_STATUSES = ["sold", "for_sale", "for_rent"]
@dataclass @dataclass
class ScraperInput: class ScraperInput:
location: str location: str
listing_type: ListingType status: str
site_name: SiteName site_name: str
proxy: str | None = None proxy: Optional[str] = None
timeframe: Optional[str] = None
def __post_init__(self):
if self.timeframe and self.timeframe not in VALID_TIMEFRAMES:
raise InvalidTimeFrame(f"Invalid timeframe provided: {self.timeframe}")
if self.status and self.status not in VALID_STATUSES:
raise InvalidTimeFrame(f"Invalid status provided: {self.status}")
class Scraper: class Scraper:
def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None): def __init__(
self,
scraper_input: ScraperInput,
session: requests.Session | tls_client.Session = None,
):
self.location = scraper_input.location self.location = scraper_input.location
self.listing_type = scraper_input.listing_type self.status = scraper_input.status
self.timeframe = scraper_input.timeframe
if not session: if not session:
self.session = requests.Session() self.session = requests.Session()
@ -27,7 +44,7 @@ class Scraper:
proxies = {"http": proxy_url, "https": proxy_url} proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type self.listing_type = scraper_input.status
self.site_name = scraper_input.site_name self.site_name = scraper_input.site_name
def search(self) -> list[Property]: def search(self) -> list[Property]:

View File

@ -1,7 +1,6 @@
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Tuple from typing import Optional
from datetime import datetime
class SiteName(Enum): class SiteName(Enum):
@ -17,52 +16,19 @@ class SiteName(Enum):
raise ValueError(f"{value} not found in {cls}") raise ValueError(f"{value} not found in {cls}")
class ListingType(Enum): class Status(Enum):
FOR_SALE = "FOR_SALE" FOR_SALE = "FOR_SALE"
FOR_RENT = "FOR_RENT" FOR_RENT = "FOR_RENT"
SOLD = "SOLD" SOLD = "SOLD"
class PropertyType(Enum):
HOUSE = "HOUSE"
BUILDING = "BUILDING"
CONDO = "CONDO"
TOWNHOUSE = "TOWNHOUSE"
SINGLE_FAMILY = "SINGLE_FAMILY"
MULTI_FAMILY = "MULTI_FAMILY"
MANUFACTURED = "MANUFACTURED"
NEW_CONSTRUCTION = "NEW_CONSTRUCTION"
APARTMENT = "APARTMENT"
APARTMENTS = "APARTMENTS"
LAND = "LAND"
LOT = "LOT"
OTHER = "OTHER"
BLANK = "BLANK"
@classmethod
def from_int_code(cls, code):
mapping = {
1: cls.HOUSE,
2: cls.CONDO,
3: cls.TOWNHOUSE,
4: cls.MULTI_FAMILY,
5: cls.LAND,
6: cls.OTHER,
8: cls.SINGLE_FAMILY,
13: cls.SINGLE_FAMILY,
}
return mapping.get(code, cls.BLANK)
@dataclass @dataclass
class Address: class Address:
address_one: str | None = None street: str | None = None
address_two: str | None = "#" unit: str | None = None
city: str | None = None city: str | None = None
state: str | None = None state: str | None = None
zip_code: str | None = None zip: str | None = None
@dataclass @dataclass
@ -74,47 +40,31 @@ class Agent:
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str | None = None
site_name: SiteName mls: str | None = None
listing_type: ListingType
address: Address
property_type: PropertyType | None = None
# house for sale
tax_assessed_value: int | None = None
lot_area_value: float | None = None
lot_area_unit: str | None = None
stories: int | None = None
year_built: int | None = None
price_per_sqft: int | None = None
mls_id: str | None = None mls_id: str | None = None
status: str | None = None
style: str | None = None
agent: Agent | None = None beds: int | None = None
img_src: str | None = None baths_full: int | None = None
description: str | None = None baths_half: int | None = None
status_text: str | None = None list_price: int | None = None
posted_time: datetime | None = None list_date: str | None = None
sold_price: int | None = None
last_sold_date: str | None = None
prc_sqft: float | None = None
est_sf: int | None = None
lot_sf: int | None = None
hoa_fee: int | None = None
# building for sale address: Address | None = None
bldg_name: str | None = None
area_min: int | None = None
beds_min: int | None = None
beds_max: int | None = None
baths_min: float | None = None
baths_max: float | None = None
sqft_min: int | None = None
sqft_max: int | None = None
price_min: int | None = None
price_max: int | None = None
unit_count: int | None = None
yr_blt: int | None = None
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
sold_date: datetime | None = None stories: int | None = None
days_on_market: int | None = None prkg_gar: float | None = None
neighborhoods: Optional[str] = None

View File

@ -4,10 +4,10 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from typing import Dict, Union
from ..models import Property, Address from ..models import Property, Address
from .. import Scraper from .. import Scraper
from ....exceptions import NoResultsFound from ....exceptions import NoResultsFound
from ....utils import parse_address_one, parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -15,9 +15,7 @@ class RealtorScraper(Scraper):
def __init__(self, scraper_input): def __init__(self, scraper_input):
self.counter = 1 self.counter = 1
super().__init__(scraper_input) super().__init__(scraper_input)
self.search_url = ( self.endpoint = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
"https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
)
def handle_location(self): def handle_location(self):
headers = { headers = {
@ -37,7 +35,7 @@ class RealtorScraper(Scraper):
params = { params = {
"input": self.location, "input": self.location,
"client_id": self.listing_type.value.lower().replace("_", "-"), "client_id": self.listing_type.lower().replace("_", "-"),
"limit": "1", "limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
} }
@ -68,7 +66,6 @@ class RealtorScraper(Scraper):
garage garage
permalink permalink
year_built year_built
stories
} }
address { address {
address_validation_code address_validation_code
@ -100,7 +97,6 @@ class RealtorScraper(Scraper):
public_record { public_record {
lot_size lot_size
sqft sqft
stories
units units
year_built year_built
} }
@ -114,56 +110,48 @@ class RealtorScraper(Scraper):
"variables": variables, "variables": variables,
} }
response = self.session.post(self.search_url, json=payload) response = self.session.post(self.endpoint, json=payload)
response_json = response.json() response_json = response.json()
property_info = response_json["data"]["property"] property_info = response_json["data"]["property"]
address_one, address_two = parse_address_one(property_info["address"]["line"])
return [ return [
Property( Property(
site_name=self.site_name,
address=Address(
address_one=address_one,
address_two=address_two,
city=property_info["address"]["city"],
state=property_info["address"]["state_code"],
zip_code=property_info["address"]["postal_code"],
),
property_url="https://www.realtor.com/realestateandhomes-detail/" property_url="https://www.realtor.com/realestateandhomes-detail/"
+ property_info["details"]["permalink"], + property_info["details"]["permalink"],
stories=property_info["details"]["stories"], address=Address(
year_built=property_info["details"]["year_built"], street=f"{property_info['address']['street_number']} {property_info['address']['street_name']} {property_info['address']['street_suffix']}",
price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"] unit=property_info["address"]["unit_value"],
if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None city=property_info["address"]["city"],
state=property_info["address"]["state_code"],
zip=property_info["address"]["postal_code"],
),
yr_blt=property_info["details"]["year_built"],
prc_sqft=property_info["basic"]["price"]
// property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None
and property_info["basic"]["price"] is not None
else None, else None,
mls_id=property_id, status=self.status.upper(),
listing_type=self.listing_type, beds=property_info["basic"]["beds"],
lot_area_value=property_info["public_record"]["lot_size"] baths_full=property_info["basic"]["baths"],
if property_info["public_record"] is not None lot_sf=property_info["basic"]["lot_sqft"],
else None, est_sf=property_info["basic"]["sqft"],
beds_min=property_info["basic"]["beds"], list_price=property_info["basic"]["price"],
beds_max=property_info["basic"]["beds"], sold_price=property_info["basic"]["sold_price"],
baths_min=property_info["basic"]["baths"],
baths_max=property_info["basic"]["baths"],
sqft_min=property_info["basic"]["sqft"],
sqft_max=property_info["basic"]["sqft"],
price_min=property_info["basic"]["price"],
price_max=property_info["basic"]["price"],
) )
] ]
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: def handle_area(self, variables: dict) -> Dict[str, Union[int, list[Property]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
query = ( query = """query Home_search(
"""query Home_search(
$city: String, $city: String,
$county: [String], $county: [String],
$state_code: String, $state_code: String,
$postal_code: String $postal_code: String,
$offset: Int, $offset: Int
) { ) {
home_search( home_search(
query: { query: {
@ -172,26 +160,66 @@ class RealtorScraper(Scraper):
postal_code: $postal_code postal_code: $postal_code
state_code: $state_code state_code: $state_code
status: %s status: %s
sold_date: {
min: %s
}
} }
limit: 200 limit: 200
offset: $offset offset: $offset
sort: [
{
field: sold_date,
direction: desc
}
]
) { ) {
count count
total total
results { results {
property_id property_id
list_date
status
last_sold_price
last_sold_date
hoa {
fee
}
description { description {
baths baths_full
baths_half
beds beds
lot_sqft lot_sqft
sqft sqft
text
sold_price sold_price
stories
year_built year_built
garage garage
unit_number sold_price
floor_number type
sub_type
name
stories
}
source {
raw {
area
status
style
}
last_update_date
contract_date
id
listing_id
name
type
listing_href
community_id
management_id
corporation_id
subdivision_status
spec_id
plan_id
tier_rank
feed_type
} }
location { location {
address { address {
@ -201,41 +229,44 @@ class RealtorScraper(Scraper):
postal_code postal_code
state_code state_code
state state
coordinate {
lon
lat
}
street_direction street_direction
street_name street_name
street_number street_number
street_post_direction street_post_direction
street_suffix street_suffix
unit unit
coordinate {
lon
lat
} }
neighborhoods {
name
} }
} }
list_price list_price
price_per_sqft price_per_sqft
style_category_tags {
exterior}
source { source {
id id
} }
} }
} }
}""" }""" % (
% self.listing_type.value.lower() self.status,
f'"$nowUTC-{self.timeframe}"',
) )
payload = { payload = {
"query": query, "query": query,
"variables": variables, "variables": variables,
} }
response = self.session.post(self.endpoint, json=payload)
response = self.session.post(self.search_url, json=payload)
response.raise_for_status() response.raise_for_status()
response_json = response.json() response_json = response.json()
if return_total:
return response_json["data"]["home_search"]["total"]
properties: list[Property] = [] properties: list[Property] = []
if ( if (
@ -246,19 +277,67 @@ class RealtorScraper(Scraper):
or response_json["data"]["home_search"] is None or response_json["data"]["home_search"] is None
or "results" not in response_json["data"]["home_search"] or "results" not in response_json["data"]["home_search"]
): ):
return [] return {"total": 0, "properties": []}
for result in response_json["data"]["home_search"]["results"]: for result in response_json["data"]["home_search"]["results"]:
self.counter += 1 self.counter += 1
address_one, _ = parse_address_one(result["location"]["address"]["line"]) mls = (
result["source"].get("id")
if "source" in result and isinstance(result["source"], dict)
else None
)
mls_id = (
result["source"].get("listing_id")
if "source" in result and isinstance(result["source"], dict)
else None
)
if not mls_id:
continue
# not type
neighborhoods_list = []
neighborhoods = result["location"].get("neighborhoods", [])
if neighborhoods:
for neighborhood in neighborhoods:
name = neighborhood.get("name")
if name:
neighborhoods_list.append(name)
neighborhoods_str = (
", ".join(neighborhoods_list) if neighborhoods_list else None
)
realty_property = Property( realty_property = Property(
property_url="https://www.realtor.com/realestateandhomes-detail/"
+ result["property_id"],
mls=mls,
mls_id=mls_id,
# status=(result["source"]["raw"].get("status").upper() if 'source' in result and isinstance(result["source"], dict) and "raw" in result["source"] and isinstance(result["source"]["raw"], dict) else None),
status=result["status"].upper(),
style=result["description"]["type"].upper(),
beds=result["description"]["beds"],
baths_full=result["description"]["baths_full"],
baths_half=result["description"]["baths_half"],
est_sf=result["description"]["sqft"],
lot_sf=result["description"]["lot_sqft"],
list_price=result["list_price"],
list_date=result["list_date"].split("T")[0],
sold_price=result["description"]["sold_price"],
prc_sqft=result["price_per_sqft"],
last_sold_date=result["last_sold_date"],
hoa_fee=result["hoa"]["fee"]
if result.get("hoa") and isinstance(result["hoa"], dict)
else None,
address=Address( address=Address(
address_one=address_one, street=f"{result['location']['address']['street_number']} {result['location']['address']['street_name']} {result['location']['address']['street_suffix']}",
unit=result["location"]["address"]["unit"],
city=result["location"]["address"]["city"], city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"], state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"], zip=result["location"]["address"]["postal_code"],
address_two=parse_address_two(result["location"]["address"]["unit"]),
), ),
yr_blt=result["description"]["year_built"],
latitude=result["location"]["address"]["coordinate"]["lat"] latitude=result["location"]["address"]["coordinate"]["lat"]
if result if result
and result.get("location") and result.get("location")
@ -273,26 +352,16 @@ class RealtorScraper(Scraper):
and result["location"]["address"].get("coordinate") and result["location"]["address"].get("coordinate")
and "lon" in result["location"]["address"]["coordinate"] and "lon" in result["location"]["address"]["coordinate"]
else None, else None,
site_name=self.site_name, prkg_gar=result["description"]["garage"],
property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"],
stories=result["description"]["stories"], stories=result["description"]["stories"],
year_built=result["description"]["year_built"], neighborhoods=neighborhoods_str,
price_per_sqft=result["price_per_sqft"],
mls_id=result["property_id"],
listing_type=self.listing_type,
lot_area_value=result["description"]["lot_sqft"],
beds_min=result["description"]["beds"],
beds_max=result["description"]["beds"],
baths_min=result["description"]["baths"],
baths_max=result["description"]["baths"],
sqft_min=result["description"]["sqft"],
sqft_max=result["description"]["sqft"],
price_min=result["list_price"],
price_max=result["list_price"],
) )
properties.append(realty_property) properties.append(realty_property)
return properties return {
"total": response_json["data"]["home_search"]["total"],
"properties": properties,
}
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
@ -311,20 +380,20 @@ class RealtorScraper(Scraper):
"offset": offset, "offset": offset,
} }
total = self.handle_area(search_variables, return_total=True) result = self.handle_area(search_variables)
total = result["total"]
homes = result["properties"]
homes = []
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures = [ futures = [
executor.submit( executor.submit(
self.handle_area, self.handle_area,
variables=search_variables | {"offset": i}, variables=search_variables | {"offset": i},
return_total=False,
) )
for i in range(0, total, 200) for i in range(200, min(total, 10000), 200)
] ]
for future in as_completed(futures): for future in as_completed(futures):
homes.extend(future.result()) homes.extend(future.result()["properties"])
return homes return homes

View File

@ -1,246 +0,0 @@
"""
homeharvest.redfin.__init__
~~~~~~~~~~~~
This module implements the scraper for redfin.com
"""
import json
from typing import Any
from .. import Scraper
from ....utils import parse_address_two, parse_address_one
from ..models import Property, Address, PropertyType, ListingType, SiteName, Agent
from ....exceptions import NoResultsFound, SearchTooBroad
from datetime import datetime
class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)
self.listing_type = scraper_input.listing_type
def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(self.location)
response = self.session.get(url)
response_json = json.loads(response.text.replace("{}&&", ""))
def get_region_type(match_type: str):
if match_type == "4":
return "2" #: zip
elif match_type == "2":
return "6" #: city
elif match_type == "1":
return "address" #: address, needs to be handled differently
elif match_type == "11":
return "state"
if "exactMatch" not in response_json["payload"]:
raise NoResultsFound("No results found for location: {}".format(self.location))
if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"]
else:
target = response_json["payload"]["sections"][0]["rows"][0]
return target["id"].split("_")[1], get_region_type(target["type"])
def _parse_home(self, home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None:
if key in home and "value" in home[key]:
return home[key]["value"]
if not single_search:
address = Address(
address_one=parse_address_one(get_value("streetLine"))[0],
address_two=parse_address_one(get_value("streetLine"))[1],
city=home.get("city"),
state=home.get("state"),
zip_code=home.get("zip"),
)
else:
address_info = home.get("streetAddress")
address_one, address_two = parse_address_one(address_info.get("assembledAddress"))
address = Address(
address_one=address_one,
address_two=address_two,
city=home.get("city"),
state=home.get("state"),
zip_code=home.get("zip"),
)
url = "https://www.redfin.com{}".format(home["url"])
lot_size_data = home.get("lotSize")
if not isinstance(lot_size_data, int):
lot_size = lot_size_data.get("value", None) if isinstance(lot_size_data, dict) else None
else:
lot_size = lot_size_data
lat_long = get_value("latLong")
return Property(
site_name=self.site_name,
listing_type=self.listing_type,
address=address,
property_url=url,
beds_min=home["beds"] if "beds" in home else None,
beds_max=home["beds"] if "beds" in home else None,
baths_min=home["baths"] if "baths" in home else None,
baths_max=home["baths"] if "baths" in home else None,
price_min=get_value("price"),
price_max=get_value("price"),
sqft_min=get_value("sqFt"),
sqft_max=get_value("sqFt"),
stories=home["stories"] if "stories" in home else None,
agent=Agent( #: listingAgent, some have sellingAgent as well
name=home['listingAgent'].get('name') if 'listingAgent' in home else None,
phone=home['listingAgent'].get('phone') if 'listingAgent' in home else None,
),
description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt") if not single_search else home.get("yearBuilt"),
lot_area_value=lot_size,
property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_sqft=get_value("pricePerSqFt") if type(home.get("pricePerSqFt")) != int else home.get("pricePerSqFt"),
mls_id=get_value("mlsId"),
latitude=lat_long.get('latitude') if lat_long else None,
longitude=lat_long.get('longitude') if lat_long else None,
sold_date=datetime.fromtimestamp(home['soldDate'] / 1000) if 'soldDate' in home else None,
days_on_market=get_value("dom")
)
def _handle_rentals(self, region_id, region_type):
url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
response = self.session.get(url)
response.raise_for_status()
homes = response.json()
properties_list = []
for home in homes["homes"]:
home_data = home["homeData"]
rental_data = home["rentalExtension"]
property_url = f"https://www.redfin.com{home_data.get('url', '')}"
address_info = home_data.get("addressInfo", {})
centroid = address_info.get("centroid", {}).get("centroid", {})
address = Address(
address_one=parse_address_one(address_info.get("formattedStreetLine"))[0],
city=address_info.get("city"),
state=address_info.get("state"),
zip_code=address_info.get("zip"),
)
price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
bed_range = rental_data.get("bedRange", {"min": None, "max": None})
bath_range = rental_data.get("bathRange", {"min": None, "max": None})
sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})
property_ = Property(
property_url=property_url,
site_name=SiteName.REDFIN,
listing_type=ListingType.FOR_RENT,
address=address,
description=rental_data.get("description"),
latitude=centroid.get("latitude"),
longitude=centroid.get("longitude"),
baths_min=bath_range.get("min"),
baths_max=bath_range.get("max"),
beds_min=bed_range.get("min"),
beds_max=bed_range.get("max"),
price_min=price_range.get("min"),
price_max=price_range.get("max"),
sqft_min=sqft_range.get("min"),
sqft_max=sqft_range.get("max"),
img_src=home_data.get("staticMapUrl"),
posted_time=rental_data.get("lastUpdated"),
bldg_name=rental_data.get("propertyName"),
)
properties_list.append(property_)
if not properties_list:
raise NoResultsFound("No rentals found for the given location.")
return properties_list
def _parse_building(self, building: dict) -> Property:
street_address = " ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
)
return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
address_one=parse_address_one(street_address)[0],
city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"],
address_two=parse_address_two(
" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
)
),
),
property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type,
unit_count=building.get("numUnitsForSale"),
)
def handle_address(self, home_id: str):
    """
    Look up a single home by its property id and return it as a one-item list.

    Only the ``aboveTheFold`` endpoint is requested. Related endpoints, kept for reference:
    https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
    https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
    https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
    https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
    """
    endpoint = f"https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={home_id}&accessLevel=3"
    raw_body = self.session.get(endpoint).text
    # The "{}&&" marker in the body is not JSON, so it is removed before decoding.
    payload = json.loads(raw_body.replace("{}&&", ""))["payload"]
    return [self._parse_home(payload["addressSectionInfo"], single_search=True)]
def search(self):
    """
    Resolve the configured location and return the matching properties.

    State-level regions are rejected with SearchTooBroad, address regions are
    delegated to ``handle_address`` and rentals to ``_handle_rentals``. Every
    other listing type queries the ``gis`` endpoint; anything that is not
    FOR_SALE adds a 30-day sold window to that query. An empty list is
    returned when the response carries no ``payload`` key.
    """
    region_id, region_type = self._handle_location()

    if region_type == "state":
        raise SearchTooBroad("State searches are not supported, please use a more specific location.")
    if region_type == "address":
        # For an address lookup the region id is used as the home id.
        return self.handle_address(region_id)
    if self.listing_type == ListingType.FOR_RENT:
        return self._handle_rentals(region_id, region_type)

    url = (
        f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
        if self.listing_type == ListingType.FOR_SALE
        else f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
    )
    decoded = json.loads(self.session.get(url).text.replace("{}&&", ""))
    if "payload" not in decoded:
        return []

    payload = decoded["payload"]
    home_entries = payload.get("homes", [])
    building_entries = payload.get("buildings", {}).values()

    parsed = [self._parse_home(entry) for entry in home_entries]
    parsed += [self._parse_building(entry) for entry in building_entries]
    return parsed

View File

@ -1,335 +0,0 @@
"""
homeharvest.zillow.__init__
~~~~~~~~~~~~
This module implements the scraper for zillow.com
"""
import re
import json
import tls_client
from .. import Scraper
from requests.exceptions import HTTPError
from ....utils import parse_address_one, parse_address_two
from ....exceptions import GeoCoordsNotFound, NoResultsFound
from ..models import Property, Address, ListingType, PropertyType, Agent
import urllib.parse
from datetime import datetime, timedelta
class ZillowScraper(Scraper):
def __init__(self, scraper_input):
    """
    Create the TLS client session, install browser-like headers and build the search URL.

    Raises NoResultsFound when ``is_plausible_location`` rejects the location.
    """
    tls_session = tls_client.Session(
        client_identifier="chrome112", random_tls_extension_order=True
    )
    super().__init__(scraper_input, tls_session)

    browser_headers = {
        "authority": "www.zillow.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "max-age=0",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
    }
    self.session.headers.update(browser_headers)

    if not self.is_plausible_location(self.location):
        raise NoResultsFound(f"Invalid location input: {self.location}")

    # URL path segment used for each supported listing type.
    url_segment = {
        ListingType.FOR_SALE: "for_sale",
        ListingType.FOR_RENT: "for_rent",
        ListingType.SOLD: "recently_sold",
    }[self.listing_type]
    self.url = f"https://www.zillow.com/homes/{url_segment}/{self.location}_rb/"
def is_plausible_location(self, location: str) -> bool:
    """Return True when the autocomplete endpoint's ``results`` for ``location`` is not an empty list."""
    encoded = urllib.parse.quote(location)
    suggestions_url = (
        "https://www.zillowstatic.com/autocomplete/v3/suggestions"
        f"?q={encoded}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
    )
    suggestions = self.session.get(suggestions_url).json()["results"]
    return suggestions != []
def search(self):
    """
    Load the search page and return the properties it describes.

    The page's ``__NEXT_DATA__`` JSON decides the path taken:
    a ``searchPageState`` entry means a result list, so the map bounds are
    read from the ``window.mapBounds`` script and passed to the backend query;
    a ``gdpClientCache`` entry means a single property page.

    Raises HTTPError on a non-200 response, NoResultsFound when the embedded
    JSON or any recognised page state is missing, and GeoCoordsNotFound when
    the map bounds cannot be matched.
    """
    page = self.session.get(self.url)
    if page.status_code != 200:
        raise HTTPError(
            f"bad response status code: {page.status_code}"
        )

    html = page.text
    next_data = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if next_data is None:
        raise NoResultsFound("No results were found for Zillow with the given Location.")

    page_props = json.loads(next_data.group(1))["props"]["pageProps"]

    if "searchPageState" in page_props:
        bounds_pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};'
        bounds = re.search(bounds_pattern, html)
        if bounds is None:
            raise GeoCoordsNotFound("Box bounds could not be located.")
        # The capture groups are in west, east, south, north order.
        return self._fetch_properties_backend([float(value) for value in bounds.groups()])

    if "gdpClientCache" in page_props:
        client_cache = json.loads(page_props["gdpClientCache"])
        first_key = list(client_cache)[0]
        single_home = self._get_single_property_page(client_cache[first_key]["property"])
        return [single_home]

    raise NoResultsFound("Specific property data not found in the response.")
def _fetch_properties_backend(self, coords):
    """
    Query the search-page-state endpoint for the map results inside ``coords``.

    ``coords`` is indexed as west, east, south, north. The filter sent depends
    on ``self.listing_type``: rentals, for-sale (sorted by "days"), or recently
    sold for anything else. Raises HTTPError on a non-200 response; otherwise
    the decoded JSON is handed to ``_parse_properties``.
    """
    # Flags switched off in both the rental and the sold filter.
    disabled_sale_flags = {
        "isForSaleByAgent": {"value": False},
        "isForSaleByOwner": {"value": False},
        "isNewConstruction": {"value": False},
        "isComingSoon": {"value": False},
        "isAuction": {"value": False},
        "isForSaleForeclosure": {"value": False},
    }

    if self.listing_type == ListingType.FOR_RENT:
        filter_state = {
            "isForRent": {"value": True},
            **disabled_sale_flags,
            "isAllHomes": {"value": True},
        }
    elif self.listing_type == ListingType.FOR_SALE:
        filter_state = {
            # Alternative sort value previously noted here: "globalrelevanceex".
            "sortSelection": {"value": "days"},
            "isAllHomes": {"value": True},
        }
    else:
        filter_state = {
            "isRecentlySold": {"value": True},
            **disabled_sale_flags,
            "isAllHomes": {"value": True},
        }

    map_bounds = {
        "west": coords[0],
        "east": coords[1],
        "south": coords[2],
        "north": coords[3],
    }
    request_body = {
        "searchQueryState": {
            "pagination": {},
            "isMapVisible": True,
            "mapBounds": map_bounds,
            "filterState": filter_state,
            "isListVisible": True,
            "mapZoom": 11,
        },
        "wants": {"cat1": ["mapResults"]},
        "isDebugRequest": False,
    }

    reply = self.session.put("https://www.zillow.com/async-create-search-page-state", json=request_body)
    if reply.status_code != 200:
        raise HTTPError(
            f"bad response status code: {reply.status_code}"
        )
    return self._parse_properties(reply.json())
@staticmethod
def parse_posted_time(time: str) -> datetime:
int_time = int(time.split(" ")[0])
if "hour" in time:
return datetime.now() - timedelta(hours=int_time)
if "day" in time:
return datetime.now() - timedelta(days=int_time)
def _parse_properties(self, property_data: dict):
    """
    Build Property objects from the ``mapResults`` list of a backend search reply.

    Results carrying ``hdpData`` are treated as individual homes; results
    carrying ``isBuilding`` are treated as multi-unit buildings. Results with
    neither key are ignored.
    """
    parsed = []
    for item in property_data["cat1"]["searchResults"]["mapResults"]:
        if "hdpData" in item:
            info = item["hdpData"]["homeInfo"]
            home_address = Address(
                address_one=parse_address_one(info.get("streetAddress"))[0],
                address_two=parse_address_two(info["unit"]) if "unit" in info else "#",
                city=info.get("city"),
                state=info.get("state"),
                zip_code=info.get("zipcode"),
            )
            detail_url = f"https://www.zillow.com{item['detailUrl']}"
            tax_value = int(info["taxAssessedValue"]) if "taxAssessedValue" in info else None
            home_type = PropertyType(info.get("homeType"))
            status_source = info["statusType"] if "statusType" in info else self.listing_type
            home_listing_type = ListingType(status_source)

            # Only "TIME_ON_INFO" variable data carries a relative posting age.
            posted = None
            if "variableData" in item:
                variable = item["variableData"]
                if "text" in variable and variable["type"] == "TIME_ON_INFO":
                    posted = self.parse_posted_time(variable["text"])

            price = info.get("price")
            bedrooms = int(info["bedrooms"]) if "bedrooms" in info else None
            bathrooms = info.get("bathrooms")
            living_area = int(info["livingArea"]) if "livingArea" in info else None

            if "livingArea" in info and info["livingArea"] != 0 and "price" in info:
                per_sqft = int(info["price"] // info["livingArea"])
            else:
                per_sqft = None

            lat_long = item["latLong"]
            latitude = lat_long["latitude"]
            longitude = lat_long["longitude"]
            lot_value = round(info["lotAreaValue"], 2) if "lotAreaValue" in info else None

            parsed.append(
                Property(
                    site_name=self.site_name,
                    address=home_address,
                    property_url=detail_url,
                    tax_assessed_value=tax_value,
                    property_type=home_type,
                    listing_type=home_listing_type,
                    status_text=item.get("statusText"),
                    posted_time=posted,
                    price_min=price,
                    price_max=price,
                    beds_min=bedrooms,
                    beds_max=bedrooms,
                    baths_min=bathrooms,
                    baths_max=bathrooms,
                    sqft_min=living_area,
                    sqft_max=living_area,
                    price_per_sqft=per_sqft,
                    latitude=latitude,
                    longitude=longitude,
                    lot_area_value=lot_value,
                    lot_area_unit=info.get("lotAreaUnit"),
                    img_src=item.get("imgSrc"),
                )
            )
        elif "isBuilding" in item:
            raw_price = item["price"]
            digits_source = raw_price
            for token in ("$", ",", "+/mo"):
                digits_source = digits_source.replace(token, "")
            found = re.search(r"(\d+)", digits_source)
            price_value = int(found.group(1)) if found else None

            detail_url = f"https://www.zillow.com{item['detailUrl']}"
            building_listing_type = ListingType(item["statusType"])
            building_address = self._extract_address(item["address"])
            # A price is only recorded when the label is a "+/mo" figure.
            monthly_price = price_value if "+/mo" in raw_price else None

            parsed.append(
                Property(
                    property_url=detail_url,
                    site_name=self.site_name,
                    property_type=PropertyType("BUILDING"),
                    listing_type=building_listing_type,
                    img_src=item.get("imgSrc"),
                    address=building_address,
                    baths_min=item.get("minBaths"),
                    area_min=item.get("minArea"),
                    bldg_name=item.get("communityName"),
                    status_text=item.get("statusText"),
                    price_min=monthly_price,
                    price_max=monthly_price,
                    latitude=item.get("latLong", {}).get("latitude"),
                    longitude=item.get("latLong", {}).get("longitude"),
                    unit_count=item.get("unitCount"),
                )
            )
    return parsed
def _get_single_property_page(self, property_data: dict):
    """
    Build a Property from the data of a single-property page.

    Used by ``search`` when the page exposes a ``gdpClientCache`` entry instead
    of a result list. ``hdpUrl`` is prefixed with the site origin unless it
    already contains "zillow.com".
    """
    hdp_url = property_data["hdpUrl"]
    url = hdp_url if "zillow.com" in hdp_url else f"https://www.zillow.com{hdp_url}"

    raw_address = property_data["address"]
    street, unit = parse_address_one(raw_address["streetAddress"])
    address = Address(
        address_one=street,
        address_two=unit or "#",
        city=raw_address["city"],
        state=raw_address["state"],
        zip_code=raw_address["zipcode"],
    )

    home_type = property_data.get("homeType")
    attribution = property_data.get("attributionInfo", {})
    reso_facts = property_data.get("resoFacts", {})
    bedrooms = property_data.get("bedrooms")
    bathrooms = property_data.get("bathrooms")
    price = property_data.get("price")
    living_area = property_data.get("livingArea")

    return Property(
        site_name=self.site_name,
        property_url=url,
        # Home types that are not PropertyType members are stored as None.
        property_type=PropertyType(home_type) if home_type in PropertyType.__members__ else None,
        listing_type=self.listing_type,
        address=address,
        year_built=property_data.get("yearBuilt"),
        tax_assessed_value=property_data.get("taxAssessedValue"),
        lot_area_value=property_data.get("lotAreaValue"),
        lot_area_unit=property_data["lotAreaUnits"].lower() if "lotAreaUnits" in property_data else None,
        agent=Agent(name=attribution.get("agentName")),
        stories=reso_facts.get("stories"),
        mls_id=attribution.get("mlsId"),
        beds_min=bedrooms,
        beds_max=bedrooms,
        baths_min=bathrooms,
        baths_max=bathrooms,
        price_min=price,
        price_max=price,
        sqft_min=living_area,
        sqft_max=living_area,
        price_per_sqft=reso_facts.get("pricePerSquareFoot"),
        latitude=property_data.get("latitude"),
        longitude=property_data.get("longitude"),
        img_src=property_data.get("streetViewTileImageUrlMediumAddress"),
        description=property_data.get("description"),
    )
def _extract_address(self, address_str):
    """
    Split a one-line address such as '555 Wedglea Dr, Dallas, TX' into an Address.

    The text must consist of exactly three ", "-separated parts: street, city
    and a final part holding the state, optionally followed by a space and the
    zip code. Any other shape raises ValueError.
    """
    pieces = address_str.split(", ")
    if len(pieces) != 3:
        raise ValueError(f"Unexpected address format: {address_str}")

    street_part, city_part, state_zip_part = pieces
    state_zip = state_zip_part.split(" ")
    if len(state_zip) > 2:
        raise ValueError(f"Unexpected state/zip format in address: {address_str}")

    state = state_zip[0].strip()
    # The zip code is optional; a lone token is the state on its own.
    zip_code = state_zip[1].strip() if len(state_zip) == 2 else None

    street, unit = parse_address_one(street_part.strip())
    return Address(
        address_one=street,
        address_two=unit or "#",
        city=city_part.strip(),
        state=state,
        zip_code=zip_code,
    )

View File

@ -16,3 +16,7 @@ class GeoCoordsNotFound(Exception):
class SearchTooBroad(Exception): class SearchTooBroad(Exception):
"""Raised when the search is too broad""" """Raised when the search is too broad"""
class InvalidTimeFrame(Exception):
"""Raised when the time frame is invalid"""

View File

@ -1,38 +1,76 @@
import re from .core.scrapers.models import Property
import pandas as pd
# Column names of the DataFrame returned by process_result, in output order.
# process_result seeds every name with None, fills each one from the matching
# Property attribute, then reindexes the frame to exactly this list.
ordered_properties = [
"PropertyURL",
"MLS",
"MLS #",  # filled from mls_id
"Status",
"Style",
"Street",  # Street, Unit, City, State and Zip come from the Property's address
"Unit",
"City",
"State",
"Zip",
"Beds",
"FB",  # filled from baths_full
"NumHB",  # filled from baths_half
"EstSF",  # filled from est_sf
"YrBlt",  # filled from yr_blt
"ListPrice",
"Lst Date",  # filled from list_date
"Sold Price",
"COEDate",  # filled from last_sold_date
"LotSFApx",  # filled from lot_sf
"PrcSqft",  # prc_sqft rounded to 2 decimals; left as None when prc_sqft is None
"LATITUDE",
"LONGITUDE",
"Stories",
"HOAFee",
"PrkgGar",  # filled from prkg_gar
"Community",  # filled from neighborhoods
]
def parse_address_one(street_address: str) -> tuple: def process_result(result: Property) -> pd.DataFrame:
if not street_address: prop_data = {prop: None for prop in ordered_properties}
return street_address, "#" prop_data.update(result.__dict__)
prop_data["PropertyURL"] = prop_data["property_url"]
prop_data["MLS"] = prop_data["mls"]
prop_data["MLS #"] = prop_data["mls_id"]
prop_data["Status"] = prop_data["status"]
prop_data["Style"] = prop_data["style"]
apt_match = re.search( if "address" in prop_data:
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$", address_data = prop_data["address"]
street_address, prop_data["Street"] = address_data.street
re.I, prop_data["Unit"] = address_data.unit
) prop_data["City"] = address_data.city
prop_data["State"] = address_data.state
prop_data["Zip"] = address_data.zip
if apt_match: prop_data["Community"] = prop_data["neighborhoods"]
apt_str = apt_match.group().strip() prop_data["Beds"] = prop_data["beds"]
cleaned_apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I) prop_data["FB"] = prop_data["baths_full"]
prop_data["NumHB"] = prop_data["baths_half"]
prop_data["EstSF"] = prop_data["est_sf"]
prop_data["ListPrice"] = prop_data["list_price"]
prop_data["Lst Date"] = prop_data["list_date"]
prop_data["Sold Price"] = prop_data["sold_price"]
prop_data["COEDate"] = prop_data["last_sold_date"]
prop_data["LotSFApx"] = prop_data["lot_sf"]
prop_data["HOAFee"] = prop_data["hoa_fee"]
main_address = street_address.replace(apt_str, "").strip() if prop_data.get("prc_sqft") is not None:
return main_address, cleaned_apt_str prop_data["PrcSqft"] = round(prop_data["prc_sqft"], 2)
else:
return street_address, "#"
prop_data["YrBlt"] = prop_data["yr_blt"]
prop_data["LATITUDE"] = prop_data["latitude"]
prop_data["LONGITUDE"] = prop_data["longitude"]
prop_data["Stories"] = prop_data["stories"]
prop_data["PrkgGar"] = prop_data["prkg_gar"]
def parse_address_two(street_address: str): properties_df = pd.DataFrame([prop_data])
if not street_address: properties_df = properties_df.reindex(columns=ordered_properties)
return "#"
apt_match = re.search(
r"(APT\s*[\dA-Z]+|#[\dA-Z]+|UNIT\s*[\dA-Z]+|LOT\s*[\dA-Z]+|SUITE\s*[\dA-Z]+)$",
street_address,
re.I,
)
if apt_match: return properties_df[ordered_properties]
apt_str = apt_match.group().strip()
apt_str = re.sub(r"(APT\s*|UNIT\s*|LOT\s*|SUITE\s*)", "#", apt_str, flags=re.I)
return apt_str
else:
return "#"

View File

@ -12,13 +12,13 @@ def test_realtor():
scrape_property( scrape_property(
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",
site_name="realtor.com", site_name="realtor.com",
listing_type="for_sale", status="for_sale",
), ),
scrape_property( scrape_property(
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent" location="Phoenix, AZ", site_name=["realtor.com"], status="for_rent"
), #: does not support "city, state, USA" format ), #: does not support "city, state, USA" format
scrape_property( scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold" location="Dallas, TX", site_name="realtor.com", status="sold"
), #: does not support "city, state, USA" format ), #: does not support "city, state, USA" format
scrape_property(location="85281", site_name="realtor.com"), scrape_property(location="85281", site_name="realtor.com"),
] ]
@ -31,7 +31,7 @@ def test_realtor():
scrape_property( scrape_property(
location="abceefg ju098ot498hh9", location="abceefg ju098ot498hh9",
site_name="realtor.com", site_name="realtor.com",
listing_type="for_sale", status="sale",
) )
] ]
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound): except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound):

View File

@ -10,10 +10,14 @@ from homeharvest.exceptions import (
def test_redfin(): def test_redfin():
results = [ results = [
scrape_property(location="San Diego", site_name="redfin", listing_type="for_sale"), scrape_property(location="San Diego", site_name="redfin", status="sale"),
scrape_property(location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"), location="2530 Al Lipscomb Way", site_name="redfin", status="sale"
scrape_property(location="Dallas, TX, USA", site_name="redfin", listing_type="sold"), ),
scrape_property(
location="Phoenix, AZ, USA", site_name=["redfin"], status="rent"
),
scrape_property(location="Dallas, TX, USA", site_name="redfin", status="sold"),
scrape_property(location="85281", site_name="redfin"), scrape_property(location="85281", site_name="redfin"),
] ]
@ -25,11 +29,17 @@ def test_redfin():
scrape_property( scrape_property(
location="abceefg ju098ot498hh9", location="abceefg ju098ot498hh9",
site_name="redfin", site_name="redfin",
listing_type="for_sale", status="sale",
), ),
scrape_property(location="Florida", site_name="redfin", listing_type="for_rent"), scrape_property(location="Florida", site_name="redfin", status="for_rent"),
] ]
except (InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound, SearchTooBroad): except (
InvalidSite,
InvalidListingType,
NoResultsFound,
GeoCoordsNotFound,
SearchTooBroad,
):
assert True assert True
assert all([result is None for result in bad_results]) assert all([result is None for result in bad_results])

View File

@ -17,7 +17,13 @@ def test_parse_address_one():
def test_parse_address_two(): def test_parse_address_two():
test_data = [("Apt 126", "#126"), ("apt 2B", "#2B"), ("UNIT 3A", "#3A"), ("unit 3A", "#3A"), ("SuIte 3A", "#3A")] test_data = [
("Apt 126", "#126"),
("apt 2B", "#2B"),
("UNIT 3A", "#3A"),
("unit 3A", "#3A"),
("SuIte 3A", "#3A"),
]
for input_data, expected in test_data: for input_data, expected in test_data:
output = parse_address_two(input_data) output = parse_address_two(input_data)

View File

@ -9,12 +9,24 @@ from homeharvest.exceptions import (
def test_zillow(): def test_zillow():
results = [ results = [
scrape_property(location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"), scrape_property(
scrape_property(location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"), location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
scrape_property(location="Surprise, AZ", site_name=["zillow"], listing_type="for_sale"), ),
scrape_property(location="Dallas, TX, USA", site_name="zillow", listing_type="sold"), scrape_property(
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
),
scrape_property(
location="Surprise, AZ", site_name=["zillow"], listing_type="for_sale"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
),
scrape_property(location="85281", site_name="zillow"), scrape_property(location="85281", site_name="zillow"),
scrape_property(location="3268 88th st s, Lakewood", site_name="zillow", listing_type="for_rent"), scrape_property(
location="3268 88th st s, Lakewood",
site_name="zillow",
listing_type="for_rent",
),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])