mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
122 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ba7ad069c9 | ||
|
|
22bda972b0 | ||
|
|
6f5bbf79a4 | ||
|
|
608cceba34 | ||
|
|
3609586995 | ||
|
|
68c7e411e4 | ||
|
|
5e825601a7 | ||
|
|
ce3f94d0af | ||
|
|
4a1116440d | ||
|
|
2d092c595f | ||
|
|
4dbb064fe9 | ||
|
|
4e78248032 | ||
|
|
37e20f4469 | ||
|
|
8a5f0dc2c9 | ||
|
|
de692faae2 | ||
|
|
6bb68766fc | ||
|
|
446d5488b8 | ||
|
|
68e15ce696 | ||
|
|
c4870677c2 | ||
|
|
51bde20c3c | ||
|
|
f8c0dd766d | ||
|
|
f06a01678c | ||
|
|
d2879734e6 | ||
|
|
bf81ef413f | ||
|
|
29664e4eee | ||
|
|
088088ae51 | ||
|
|
40bbf76db1 | ||
|
|
1f1ca8068f | ||
|
|
8388d47f73 | ||
|
|
ba503b0ca3 | ||
|
|
8962d619e1 | ||
|
|
3b7c17b7b5 | ||
|
|
59317fd6fc | ||
|
|
928b431d1f | ||
|
|
896f862137 | ||
|
|
3174f5076c | ||
|
|
2abbb913a8 | ||
|
|
73b6d5b33f | ||
|
|
da39c989d9 | ||
|
|
01c53f9399 | ||
|
|
9200c17df2 | ||
|
|
9e262bf214 | ||
|
|
82f78fb578 | ||
|
|
b0e40df00a | ||
|
|
2fc40e0dad | ||
|
|
254f3a68a1 | ||
|
|
05713c76b0 | ||
|
|
9120cc9bfe | ||
|
|
eee4b19515 | ||
|
|
c25961eded | ||
|
|
0884c3d163 | ||
|
|
8f37bfdeb8 | ||
|
|
48c2338276 | ||
|
|
f58a1f4a74 | ||
|
|
4cef926d7d | ||
|
|
e82eeaa59f | ||
|
|
644f16b25b | ||
|
|
e9ddc6df92 | ||
|
|
50fb1c391d | ||
|
|
4f91f9dadb | ||
|
|
66e55173b1 | ||
|
|
f6054e8746 | ||
|
|
e8d9235ee6 | ||
|
|
043f091158 | ||
|
|
eae8108978 | ||
|
|
0a39357a07 | ||
|
|
8f06d46ddb | ||
|
|
0dae14ccfc | ||
|
|
9aaabdd5d8 | ||
|
|
cdf41fe9f2 | ||
|
|
1f0feb836d | ||
|
|
5f31beda46 | ||
|
|
fd9cdea499 | ||
|
|
93a1cbe17f | ||
|
|
49d27943c4 | ||
|
|
05fca9b7e6 | ||
|
|
20ce44fb3a | ||
|
|
52017c1bb5 | ||
|
|
dba1c03081 | ||
|
|
1fc2d8c549 | ||
|
|
02d112eea0 | ||
|
|
30e510882b | ||
|
|
78b56c2cac | ||
|
|
087854a688 | ||
|
|
80586467a8 | ||
|
|
3494b152b8 | ||
|
|
6c6fef80ed | ||
|
|
62e3321277 | ||
|
|
80186ee8c5 | ||
|
|
3ec47c5b6a | ||
|
|
42e8ac4de9 | ||
|
|
e1917009ae | ||
|
|
7297f0eb33 | ||
|
|
2eec389838 | ||
|
|
b01162161d | ||
|
|
906ce92685 | ||
|
|
cc76e067b2 | ||
|
|
1f0c351974 | ||
|
|
a1684f87db | ||
|
|
2ae3ebe28e | ||
|
|
ae3961514b | ||
|
|
0621b01d9a | ||
|
|
fbbd56d930 | ||
|
|
82092faa28 | ||
|
|
8f90a80b0a | ||
|
|
d5b4d80f96 | ||
|
|
086bcfd224 | ||
|
|
4726764482 | ||
|
|
ca260fd2b4 | ||
|
|
94e5b090da | ||
|
|
d0a6a66b6a | ||
|
|
8e140a0e45 | ||
|
|
588689c230 | ||
|
|
c7a4bfd5e4 | ||
|
|
fe351ab57c | ||
|
|
5d0f519a85 | ||
|
|
869d7e7c51 | ||
|
|
ffd3ce6aed | ||
|
|
471e53118e | ||
|
|
dc8c15959f | ||
|
|
10c01f373e | ||
|
|
fd01bfb8b8 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -3,4 +3,5 @@
|
||||
**/__pycache__/
|
||||
**/.pytest_cache/
|
||||
*.pyc
|
||||
/.ipynb_checkpoints/
|
||||
/.ipynb_checkpoints/
|
||||
*.csv
|
||||
192
README.md
192
README.md
@@ -1,33 +1,193 @@
|
||||
# HomeHarvest
|
||||
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">
|
||||
|
||||
**HomeHarvest** aims to be the top Python real estate scraping library.
|
||||
**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings.
|
||||
|
||||
_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._
|
||||
[](https://replit.com/@ZacharyHampton/HomeHarvestDemo)
|
||||
|
||||
\
|
||||
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
|
||||
|
||||
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||
|
||||
Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** – a Python package for job scraping*
|
||||
|
||||
## HomeHarvest Features
|
||||
|
||||
- **Source**: Fetches properties directly from **Realtor.com**.
|
||||
- **Data Format**: Structures data to resemble MLS listings.
|
||||
- **Export Flexibility**: Options to save as either CSV or Excel.
|
||||
- **Usage Modes**:
|
||||
- **Python**: For those who'd like to integrate scraping into their Python scripts.
|
||||
- **CLI**: For users who prefer command-line operations.
|
||||
|
||||
|
||||
[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_
|
||||
|
||||

|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install --upgrade homeharvest
|
||||
pip install homeharvest
|
||||
```
|
||||
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
||||
|
||||
## Example Usage
|
||||
```
|
||||
## Usage
|
||||
|
||||
### Python
|
||||
|
||||
```py
|
||||
from homeharvest import scrape_property
|
||||
from datetime import datetime
|
||||
|
||||
# Generate filename based on current timestamp
|
||||
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"HomeHarvest_{current_timestamp}.csv"
|
||||
|
||||
properties = scrape_property(
|
||||
location="85281", site_name="zillow", listing_type="for_rent"
|
||||
location="San Diego, CA",
|
||||
listing_type="sold", # or (for_sale, for_rent)
|
||||
property_younger_than=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
|
||||
# pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
|
||||
# mls_only=True, # only fetch MLS listings
|
||||
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
|
||||
)
|
||||
print(properties)
|
||||
print(f"Number of properties: {len(properties)}")
|
||||
|
||||
# Export to csv
|
||||
properties.to_csv(filename, index=False)
|
||||
print(properties.head())
|
||||
```
|
||||
|
||||
### Site Name Options
|
||||
### CLI
|
||||
|
||||
- `zillow`
|
||||
- `redfin`
|
||||
- `realtor.com`
|
||||
```
|
||||
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
|
||||
|
||||
Home Harvest Property Scraper
|
||||
|
||||
positional arguments:
|
||||
location Location to scrape (e.g., San Francisco, CA)
|
||||
|
||||
options:
|
||||
-l {for_sale,for_rent,sold}, --listing_type {for_sale,for_rent,sold}
|
||||
Listing type to scrape
|
||||
-o {excel,csv}, --output {excel,csv}
|
||||
Output format
|
||||
-f FILENAME, --filename FILENAME
|
||||
Name of the output file (without extension)
|
||||
-p PROXY, --proxy PROXY
|
||||
Proxy to use for scraping
|
||||
-d DAYS, --days DAYS Sold/listed in last _ days filter.
|
||||
-r RADIUS, --radius RADIUS
|
||||
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
|
||||
-m, --mls_only If set, fetches only MLS listings.
|
||||
-c, --pending_or_contingent
|
||||
If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.
|
||||
|
||||
### Listing Types
|
||||
```
|
||||
```bash
|
||||
> homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
|
||||
```
|
||||
|
||||
|
||||
## Output
|
||||
```plaintext
|
||||
>>> properties.head()
|
||||
MLS MLS # Status Style ... COEDate LotSFApx PrcSqft Stories
|
||||
0 SDCA 230018348 SOLD CONDOS ... 2023-10-03 290110 803 2
|
||||
1 SDCA 230016614 SOLD TOWNHOMES ... 2023-10-03 None 838 3
|
||||
2 SDCA 230016367 SOLD CONDOS ... 2023-10-03 30056 649 1
|
||||
3 MRCA NDP2306335 SOLD SINGLE_FAMILY ... 2023-10-03 7519 661 2
|
||||
4 SDCA 230014532 SOLD CONDOS ... 2023-10-03 None 752 1
|
||||
[5 rows x 22 columns]
|
||||
```
|
||||
|
||||
### Parameters for `scrape_property()`
|
||||
```
|
||||
Required
|
||||
├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.
|
||||
└── listing_type (option): Choose the type of listing.
|
||||
- 'for_rent'
|
||||
- 'for_sale'
|
||||
- 'sold'
|
||||
|
||||
Optional
|
||||
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
|
||||
│ Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
|
||||
│
|
||||
├── property_younger_than (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
|
||||
│ Example: 30 (fetches properties listed/sold in the last 30 days)
|
||||
|
|
||||
├── pending_or_contingent (True/False): If set, fetches only pending or contingent listings. Only applicable for `for_sale` listings from general area searches.
|
||||
│
|
||||
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
|
||||
│
|
||||
└── proxy (string): In format 'http://user:pass@host:port'
|
||||
|
||||
```
|
||||
### Property Schema
|
||||
```plaintext
|
||||
Property
|
||||
├── Basic Information:
|
||||
│ ├── property_url
|
||||
│ ├── mls
|
||||
│ ├── mls_id
|
||||
│ └── status
|
||||
|
||||
├── Address Details:
|
||||
│ ├── street
|
||||
│ ├── unit
|
||||
│ ├── city
|
||||
│ ├── state
|
||||
│ └── zip_code
|
||||
|
||||
├── Property Description:
|
||||
│ ├── style
|
||||
│ ├── beds
|
||||
│ ├── full_baths
|
||||
│ ├── half_baths
|
||||
│ ├── sqft
|
||||
│ ├── year_built
|
||||
│ ├── stories
|
||||
│ └── lot_sqft
|
||||
|
||||
├── Property Listing Details:
|
||||
│ ├── list_price
|
||||
│ ├── list_date
|
||||
│ ├── sold_price
|
||||
│ ├── last_sold_date
|
||||
│ ├── price_per_sqft
|
||||
│ └── hoa_fee
|
||||
|
||||
├── Location Details:
|
||||
│ ├── latitude
|
||||
│ ├── longitude
|
||||
|
||||
└── Parking Details:
|
||||
└── parking_garage
|
||||
```
|
||||
|
||||
### Exceptions
|
||||
The following exceptions may be raised when using HomeHarvest:
|
||||
|
||||
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
|
||||
- `NoResultsFound` - no properties found from your search
|
||||
|
||||
|
||||
## Frequently Asked Questions
|
||||
---
|
||||
|
||||
**Q: Encountering issues with your searches?**
|
||||
**A:** Try to broaden the parameters you're using. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues).
|
||||
|
||||
---
|
||||
|
||||
**Q: Received a Forbidden 403 response code?**
|
||||
**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend:
|
||||
|
||||
- Waiting a few seconds between requests.
|
||||
- Trying a VPN or using a proxy as a parameter to scrape_property() to change your IP address.
|
||||
|
||||
---
|
||||
|
||||
- `for_rent`
|
||||
- `for_sale`
|
||||
- `sold`
|
||||
|
||||
@@ -31,8 +31,29 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# check for sale properties\n",
|
||||
"scrape_property(\n",
|
||||
" location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
|
||||
" location=\"dallas\",\n",
|
||||
" listing_type=\"for_sale\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aaf86093",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# search a specific address\n",
|
||||
"scrape_property(\n",
|
||||
" location=\"2530 Al Lipscomb Way\",\n",
|
||||
" listing_type=\"for_sale\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -43,8 +64,29 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# check rentals\n",
|
||||
"scrape_property(\n",
|
||||
" location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
|
||||
" location=\"chicago, illinois\",\n",
|
||||
" listing_type=\"for_rent\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "af280cd3",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# check sold properties\n",
|
||||
"scrape_property(\n",
|
||||
" location=\"90210\",\n",
|
||||
" listing_type=\"sold\"\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
20
examples/HomeHarvest_Demo.py
Normal file
20
examples/HomeHarvest_Demo.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from homeharvest import scrape_property
|
||||
from datetime import datetime
|
||||
|
||||
# Generate filename based on current timestamp
|
||||
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"HomeHarvest_{current_timestamp}.csv"
|
||||
|
||||
properties = scrape_property(
|
||||
location="San Diego, CA",
|
||||
listing_type="sold", # or (for_sale, for_rent)
|
||||
property_younger_than=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
|
||||
# pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
|
||||
# mls_only=True, # only fetch MLS listings
|
||||
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
|
||||
)
|
||||
print(f"Number of properties: {len(properties)}")
|
||||
|
||||
# Export to csv
|
||||
properties.to_csv(filename, index=False)
|
||||
print(properties.head())
|
||||
@@ -1,117 +1,50 @@
|
||||
from .core.scrapers.redfin import RedfinScraper
|
||||
from .core.scrapers.realtor import RealtorScraper
|
||||
from .core.scrapers.zillow import ZillowScraper
|
||||
from .core.scrapers.models import ListingType, Property, Building, SiteName
|
||||
from .core.scrapers import ScraperInput
|
||||
from .exceptions import InvalidSite, InvalidListingType
|
||||
from typing import Union
|
||||
import warnings
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_scrapers = {
|
||||
"redfin": RedfinScraper,
|
||||
"realtor.com": RealtorScraper,
|
||||
"zillow": ZillowScraper,
|
||||
}
|
||||
|
||||
|
||||
def validate_input(site_name: str, listing_type: str) -> None:
|
||||
if site_name.lower() not in _scrapers:
|
||||
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
|
||||
|
||||
if listing_type.upper() not in ListingType.__members__:
|
||||
raise InvalidListingType(
|
||||
f"Provided listing type, '{listing_type}', does not exist."
|
||||
)
|
||||
|
||||
|
||||
def get_ordered_properties(result: Union[Building, Property]) -> list[str]:
|
||||
if isinstance(result, Property):
|
||||
return [
|
||||
"listing_type",
|
||||
"address_one",
|
||||
"city",
|
||||
"state",
|
||||
"zip_code",
|
||||
"address_two",
|
||||
"url",
|
||||
"property_type",
|
||||
"price",
|
||||
"beds",
|
||||
"baths",
|
||||
"square_feet",
|
||||
"price_per_square_foot",
|
||||
"lot_size",
|
||||
"stories",
|
||||
"year_built",
|
||||
"agent_name",
|
||||
"mls_id",
|
||||
"description",
|
||||
]
|
||||
elif isinstance(result, Building):
|
||||
return [
|
||||
"address_one",
|
||||
"city",
|
||||
"state",
|
||||
"zip_code",
|
||||
"address_two",
|
||||
"url",
|
||||
"num_units",
|
||||
"min_unit_price",
|
||||
"max_unit_price",
|
||||
"avg_unit_price",
|
||||
"listing_type",
|
||||
]
|
||||
return []
|
||||
|
||||
|
||||
def process_result(result: Union[Building, Property]) -> pd.DataFrame:
|
||||
prop_data = result.__dict__
|
||||
|
||||
address_data = prop_data["address"]
|
||||
prop_data["site_name"] = prop_data["site_name"]
|
||||
prop_data["listing_type"] = prop_data["listing_type"].value
|
||||
prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data.get("property_type") else None
|
||||
prop_data["address_one"] = address_data.address_one
|
||||
prop_data["city"] = address_data.city
|
||||
prop_data["state"] = address_data.state
|
||||
prop_data["zip_code"] = address_data.zip_code
|
||||
prop_data["address_two"] = address_data.address_two
|
||||
|
||||
del prop_data["address"]
|
||||
|
||||
properties_df = pd.DataFrame([prop_data])
|
||||
properties_df = properties_df[get_ordered_properties(result)]
|
||||
|
||||
return properties_df
|
||||
from .core.scrapers import ScraperInput
|
||||
from .utils import process_result, ordered_properties, validate_input
|
||||
from .core.scrapers.realtor import RealtorScraper
|
||||
from .core.scrapers.models import ListingType
|
||||
from .exceptions import InvalidListingType, NoResultsFound
|
||||
|
||||
|
||||
def scrape_property(
|
||||
location: str,
|
||||
site_name: str,
|
||||
listing_type: str = "for_sale", #: for_sale, for_rent, sold
|
||||
listing_type: str = "for_sale",
|
||||
radius: float = None,
|
||||
mls_only: bool = False,
|
||||
property_younger_than: int = None,
|
||||
pending_or_contingent: bool = False,
|
||||
proxy: str = None,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Scrape property from various sites from a given location and listing type.
|
||||
|
||||
:returns: pd.DataFrame
|
||||
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
|
||||
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
|
||||
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
|
||||
:return: pd.DataFrame containing properties
|
||||
Scrape properties from Realtor.com based on a given location and listing type.
|
||||
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
|
||||
:param listing_type: Listing Type (for_sale, for_rent, sold)
|
||||
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
|
||||
:param mls_only: If set, fetches only listings with MLS IDs.
|
||||
:param property_younger_than: Get properties sold/listed in last _ days.
|
||||
:param pending_or_contingent: If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.
|
||||
:param proxy: Proxy to use for scraping
|
||||
"""
|
||||
|
||||
validate_input(site_name, listing_type)
|
||||
validate_input(listing_type)
|
||||
|
||||
scraper_input = ScraperInput(
|
||||
location=location,
|
||||
listing_type=ListingType[listing_type.upper()],
|
||||
site_name=site_name.lower(),
|
||||
proxy=proxy,
|
||||
radius=radius,
|
||||
mls_only=mls_only,
|
||||
last_x_days=property_younger_than,
|
||||
pending_or_contingent=pending_or_contingent,
|
||||
)
|
||||
|
||||
site = _scrapers[site_name.lower()](scraper_input)
|
||||
site = RealtorScraper(scraper_input)
|
||||
results = site.search()
|
||||
|
||||
properties_dfs = [process_result(result) for result in results]
|
||||
if not properties_dfs:
|
||||
raise NoResultsFound("no results found for the query")
|
||||
|
||||
return pd.concat(properties_dfs, ignore_index=True)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", category=FutureWarning)
|
||||
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
|
||||
|
||||
97
homeharvest/cli.py
Normal file
97
homeharvest/cli.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import argparse
|
||||
import datetime
|
||||
from homeharvest import scrape_property
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
|
||||
parser.add_argument(
|
||||
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--listing_type",
|
||||
type=str,
|
||||
default="for_sale",
|
||||
choices=["for_sale", "for_rent", "sold"],
|
||||
help="Listing type to scrape",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
type=str,
|
||||
default="excel",
|
||||
choices=["excel", "csv"],
|
||||
help="Output format",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--filename",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Name of the output file (without extension)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--days",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Sold/listed in last _ days filter.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--radius",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--mls_only",
|
||||
action="store_true",
|
||||
help="If set, fetches only MLS listings.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--pending_or_contingent",
|
||||
action="store_true",
|
||||
help="If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
result = scrape_property(
|
||||
args.location,
|
||||
args.listing_type,
|
||||
radius=args.radius,
|
||||
proxy=args.proxy,
|
||||
mls_only=args.mls_only,
|
||||
property_younger_than=args.days,
|
||||
pending_or_contingent=args.pending_or_contingent,
|
||||
)
|
||||
|
||||
if not args.filename:
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
args.filename = f"HomeHarvest_{timestamp}"
|
||||
|
||||
if args.output == "excel":
|
||||
output_filename = f"{args.filename}.xlsx"
|
||||
result.to_excel(output_filename, index=False)
|
||||
print(f"Excel file saved as {output_filename}")
|
||||
elif args.output == "csv":
|
||||
output_filename = f"{args.filename}.csv"
|
||||
result.to_csv(output_filename, index=False)
|
||||
print(f"CSV file saved as {output_filename}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,5 +1,6 @@
|
||||
from dataclasses import dataclass
|
||||
import requests
|
||||
import tls_client
|
||||
from .models import Property, ListingType, SiteName
|
||||
|
||||
|
||||
@@ -7,24 +8,37 @@ from .models import Property, ListingType, SiteName
|
||||
class ScraperInput:
|
||||
location: str
|
||||
listing_type: ListingType
|
||||
site_name: str
|
||||
proxy_url: str | None = None
|
||||
radius: float | None = None
|
||||
mls_only: bool | None = None
|
||||
proxy: str | None = None
|
||||
last_x_days: int | None = None
|
||||
pending_or_contingent: bool | None = None
|
||||
|
||||
|
||||
class Scraper:
|
||||
def __init__(self, scraper_input: ScraperInput):
|
||||
def __init__(
|
||||
self,
|
||||
scraper_input: ScraperInput,
|
||||
session: requests.Session | tls_client.Session = None,
|
||||
):
|
||||
self.location = scraper_input.location
|
||||
self.listing_type = scraper_input.listing_type
|
||||
|
||||
self.session = requests.Session()
|
||||
self.listing_type = scraper_input.listing_type
|
||||
self.site_name = scraper_input.site_name
|
||||
if not session:
|
||||
self.session = requests.Session()
|
||||
else:
|
||||
self.session = session
|
||||
|
||||
if scraper_input.proxy_url:
|
||||
self.session.proxies = {
|
||||
"http": scraper_input.proxy_url,
|
||||
"https": scraper_input.proxy_url,
|
||||
}
|
||||
if scraper_input.proxy:
|
||||
proxy_url = scraper_input.proxy
|
||||
proxies = {"http": proxy_url, "https": proxy_url}
|
||||
self.session.proxies.update(proxies)
|
||||
|
||||
self.listing_type = scraper_input.listing_type
|
||||
self.radius = scraper_input.radius
|
||||
self.last_x_days = scraper_input.last_x_days
|
||||
self.mls_only = scraper_input.mls_only
|
||||
self.pending_or_contingent = scraper_input.pending_or_contingent
|
||||
|
||||
def search(self) -> list[Property]:
|
||||
...
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class SiteName(Enum):
|
||||
@@ -7,78 +8,58 @@ class SiteName(Enum):
|
||||
REDFIN = "redfin"
|
||||
REALTOR = "realtor.com"
|
||||
|
||||
@classmethod
|
||||
def get_by_value(cls, value):
|
||||
for item in cls:
|
||||
if item.value == value:
|
||||
return item
|
||||
raise ValueError(f"{value} not found in {cls}")
|
||||
|
||||
|
||||
class ListingType(Enum):
|
||||
FOR_SALE = "for_sale"
|
||||
FOR_RENT = "for_rent"
|
||||
SOLD = "sold"
|
||||
|
||||
|
||||
class PropertyType(Enum):
|
||||
HOUSE = "HOUSE"
|
||||
CONDO = "CONDO"
|
||||
TOWNHOUSE = "TOWNHOUSE"
|
||||
SINGLE_FAMILY = "SINGLE_FAMILY"
|
||||
MULTI_FAMILY = "MULTI_FAMILY"
|
||||
MANUFACTURED = "MANUFACTURED"
|
||||
APARTMENT = "APARTMENT"
|
||||
LAND = "LAND"
|
||||
OTHER = "OTHER"
|
||||
|
||||
@classmethod
|
||||
def from_int_code(cls, code):
|
||||
mapping = {
|
||||
1: cls.HOUSE,
|
||||
2: cls.CONDO,
|
||||
3: cls.TOWNHOUSE,
|
||||
4: cls.MULTI_FAMILY,
|
||||
5: cls.LAND,
|
||||
6: cls.OTHER,
|
||||
8: cls.SINGLE_FAMILY,
|
||||
13: cls.SINGLE_FAMILY,
|
||||
}
|
||||
|
||||
return mapping.get(code, cls.OTHER)
|
||||
FOR_SALE = "FOR_SALE"
|
||||
FOR_RENT = "FOR_RENT"
|
||||
SOLD = "SOLD"
|
||||
|
||||
|
||||
@dataclass
|
||||
class Address:
|
||||
address_one: str
|
||||
city: str
|
||||
state: str
|
||||
zip_code: str
|
||||
|
||||
address_two: str | None = None
|
||||
|
||||
|
||||
@dataclass()
|
||||
class Realty:
|
||||
site_name: str
|
||||
address: Address
|
||||
url: str
|
||||
listing_type: ListingType | None = None
|
||||
street: str | None = None
|
||||
unit: str | None = None
|
||||
city: str | None = None
|
||||
state: str | None = None
|
||||
zip: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Property(Realty):
|
||||
price: int | None = None
|
||||
class Description:
|
||||
style: str | None = None
|
||||
beds: int | None = None
|
||||
baths: float | None = None
|
||||
stories: int | None = None
|
||||
baths_full: int | None = None
|
||||
baths_half: int | None = None
|
||||
sqft: int | None = None
|
||||
lot_sqft: int | None = None
|
||||
sold_price: int | None = None
|
||||
year_built: int | None = None
|
||||
square_feet: int | None = None
|
||||
price_per_square_foot: int | None = None
|
||||
mls_id: str | None = None
|
||||
|
||||
agent_name: str | None = None
|
||||
property_type: PropertyType | None = None
|
||||
lot_size: int | None = None
|
||||
description: str | None = None
|
||||
garage: float | None = None
|
||||
stories: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Building(Realty):
|
||||
num_units: int | None = None
|
||||
min_unit_price: int | None = None
|
||||
max_unit_price: int | None = None
|
||||
avg_unit_price: int | None = None
|
||||
class Property:
|
||||
property_url: str
|
||||
mls: str | None = None
|
||||
mls_id: str | None = None
|
||||
status: str | None = None
|
||||
address: Address | None = None
|
||||
|
||||
list_price: int | None = None
|
||||
list_date: str | None = None
|
||||
last_sold_date: str | None = None
|
||||
prc_sqft: int | None = None
|
||||
hoa_fee: int | None = None
|
||||
description: Description | None = None
|
||||
|
||||
latitude: float | None = None
|
||||
longitude: float | None = None
|
||||
neighborhoods: Optional[str] = None
|
||||
|
||||
@@ -1,54 +1,195 @@
|
||||
import json
|
||||
from ..models import Property, Address
|
||||
from .. import Scraper
|
||||
from typing import Any, Generator
|
||||
from ....exceptions import NoResultsFound
|
||||
"""
|
||||
homeharvest.realtor.__init__
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This module implements the scraper for realtor.com
|
||||
"""
|
||||
from typing import Dict, Union, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from .. import Scraper
|
||||
from ....exceptions import NoResultsFound
|
||||
from ..models import Property, Address, ListingType, Description
|
||||
|
||||
|
||||
class RealtorScraper(Scraper):
|
||||
SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
||||
PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
|
||||
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
|
||||
|
||||
def __init__(self, scraper_input):
|
||||
self.counter = 1
|
||||
super().__init__(scraper_input)
|
||||
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
|
||||
|
||||
def handle_location(self):
|
||||
headers = {
|
||||
"authority": "parser-external.geo.moveaws.com",
|
||||
"accept": "*/*",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"origin": "https://www.realtor.com",
|
||||
"referer": "https://www.realtor.com/",
|
||||
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": '"Windows"',
|
||||
"sec-fetch-dest": "empty",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-site": "cross-site",
|
||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
||||
}
|
||||
|
||||
params = {
|
||||
"input": self.location,
|
||||
"client_id": self.listing_type.value.replace('_', '-'),
|
||||
"client_id": self.listing_type.value.lower().replace("_", "-"),
|
||||
"limit": "1",
|
||||
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
|
||||
}
|
||||
|
||||
response = self.session.get(
|
||||
"https://parser-external.geo.moveaws.com/suggest",
|
||||
self.ADDRESS_AUTOCOMPLETE_URL,
|
||||
params=params,
|
||||
headers=headers,
|
||||
)
|
||||
response_json = response.json()
|
||||
|
||||
result = response_json["autocomplete"]
|
||||
|
||||
if result is None:
|
||||
if not result:
|
||||
raise NoResultsFound("No results found for location: " + self.location)
|
||||
|
||||
return result[0]
|
||||
|
||||
def handle_listing(self, listing_id: str) -> list[Property]:
|
||||
query = """query Listing($listing_id: ID!) {
|
||||
listing(id: $listing_id) {
|
||||
source {
|
||||
id
|
||||
listing_id
|
||||
}
|
||||
address {
|
||||
street_number
|
||||
street_name
|
||||
street_suffix
|
||||
unit
|
||||
city
|
||||
state_code
|
||||
postal_code
|
||||
location {
|
||||
coordinate {
|
||||
lat
|
||||
lon
|
||||
}
|
||||
}
|
||||
}
|
||||
basic {
|
||||
sqft
|
||||
beds
|
||||
baths_full
|
||||
baths_half
|
||||
lot_sqft
|
||||
sold_price
|
||||
sold_price
|
||||
type
|
||||
price
|
||||
status
|
||||
sold_date
|
||||
list_date
|
||||
}
|
||||
details {
|
||||
year_built
|
||||
stories
|
||||
garage
|
||||
permalink
|
||||
}
|
||||
}
|
||||
}"""
|
||||
|
||||
variables = {"listing_id": listing_id}
|
||||
payload = {
|
||||
"query": query,
|
||||
"variables": variables,
|
||||
}
|
||||
|
||||
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
|
||||
response_json = response.json()
|
||||
|
||||
property_info = response_json["data"]["listing"]
|
||||
|
||||
mls = (
|
||||
property_info["source"].get("id")
|
||||
if "source" in property_info and isinstance(property_info["source"], dict)
|
||||
else None
|
||||
)
|
||||
|
||||
able_to_get_lat_long = (
|
||||
property_info
|
||||
and property_info.get("address")
|
||||
and property_info["address"].get("location")
|
||||
and property_info["address"]["location"].get("coordinate")
|
||||
)
|
||||
|
||||
listing = Property(
|
||||
mls=mls,
|
||||
mls_id=property_info["source"].get("listing_id")
|
||||
if "source" in property_info and isinstance(property_info["source"], dict)
|
||||
else None,
|
||||
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
|
||||
status=property_info["basic"]["status"].upper(),
|
||||
list_price=property_info["basic"]["price"],
|
||||
list_date=property_info["basic"]["list_date"].split("T")[0]
|
||||
if property_info["basic"].get("list_date")
|
||||
else None,
|
||||
prc_sqft=property_info["basic"].get("price")
|
||||
/ property_info["basic"].get("sqft")
|
||||
if property_info["basic"].get("price")
|
||||
and property_info["basic"].get("sqft")
|
||||
else None,
|
||||
last_sold_date=property_info["basic"]["sold_date"].split("T")[0]
|
||||
if property_info["basic"].get("sold_date")
|
||||
else None,
|
||||
latitude=property_info["address"]["location"]["coordinate"].get("lat")
|
||||
if able_to_get_lat_long
|
||||
else None,
|
||||
longitude=property_info["address"]["location"]["coordinate"].get("lon")
|
||||
if able_to_get_lat_long
|
||||
else None,
|
||||
address=self._parse_address(property_info, search_type="handle_listing"),
|
||||
description=Description(
|
||||
style=property_info["basic"].get("type", "").upper(),
|
||||
beds=property_info["basic"].get("beds"),
|
||||
baths_full=property_info["basic"].get("baths_full"),
|
||||
baths_half=property_info["basic"].get("baths_half"),
|
||||
sqft=property_info["basic"].get("sqft"),
|
||||
lot_sqft=property_info["basic"].get("lot_sqft"),
|
||||
sold_price=property_info["basic"].get("sold_price"),
|
||||
year_built=property_info["details"].get("year_built"),
|
||||
garage=property_info["details"].get("garage"),
|
||||
stories=property_info["details"].get("stories"),
|
||||
),
|
||||
)
|
||||
|
||||
return [listing]
|
||||
|
||||
def get_latest_listing_id(self, property_id: str) -> str | None:
|
||||
query = """query Property($property_id: ID!) {
|
||||
property(id: $property_id) {
|
||||
listings {
|
||||
listing_id
|
||||
primary
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
variables = {"property_id": property_id}
|
||||
payload = {
|
||||
"query": query,
|
||||
"variables": variables,
|
||||
}
|
||||
|
||||
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
|
||||
response_json = response.json()
|
||||
|
||||
property_info = response_json["data"]["property"]
|
||||
if property_info["listings"] is None:
|
||||
return None
|
||||
|
||||
primary_listing = next(
|
||||
(listing for listing in property_info["listings"] if listing["primary"]),
|
||||
None,
|
||||
)
|
||||
if primary_listing:
|
||||
return primary_listing["listing_id"]
|
||||
else:
|
||||
return property_info["listings"][0]["listing_id"]
|
||||
|
||||
def handle_address(self, property_id: str) -> list[Property]:
|
||||
"""
|
||||
Handles a specific address & returns one property
|
||||
"""
|
||||
query = """query Property($property_id: ID!) {
|
||||
property(id: $property_id) {
|
||||
property_id
|
||||
@@ -60,22 +201,19 @@ class RealtorScraper(Scraper):
|
||||
stories
|
||||
}
|
||||
address {
|
||||
address_validation_code
|
||||
city
|
||||
country
|
||||
county
|
||||
line
|
||||
postal_code
|
||||
state_code
|
||||
street_direction
|
||||
street_name
|
||||
street_number
|
||||
street_name
|
||||
street_suffix
|
||||
street_post_direction
|
||||
unit_value
|
||||
unit
|
||||
unit_descriptor
|
||||
zip
|
||||
city
|
||||
state_code
|
||||
postal_code
|
||||
location {
|
||||
coordinate {
|
||||
lat
|
||||
lon
|
||||
}
|
||||
}
|
||||
}
|
||||
basic {
|
||||
baths
|
||||
@@ -96,173 +234,370 @@ class RealtorScraper(Scraper):
|
||||
}
|
||||
}"""
|
||||
|
||||
variables = {
|
||||
'property_id': property_id
|
||||
}
|
||||
variables = {"property_id": property_id}
|
||||
|
||||
payload = {
|
||||
'query': query,
|
||||
'variables': variables,
|
||||
"query": query,
|
||||
"variables": variables,
|
||||
}
|
||||
|
||||
response = self.session.post(self.search_url, json=payload)
|
||||
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
|
||||
response_json = response.json()
|
||||
|
||||
property_info = response_json['data']['property']
|
||||
property_info = response_json["data"]["property"]
|
||||
|
||||
return [Property(
|
||||
site_name=self.site_name,
|
||||
address=Address(
|
||||
address_one=property_info['address']['line'],
|
||||
city=property_info['address']['city'],
|
||||
state=property_info['address']['state_code'],
|
||||
zip_code=property_info['address']['postal_code'],
|
||||
),
|
||||
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'],
|
||||
beds=property_info['basic']['beds'],
|
||||
baths=property_info['basic']['baths'],
|
||||
stories=property_info['details']['stories'],
|
||||
year_built=property_info['details']['year_built'],
|
||||
square_feet=property_info['basic']['sqft'],
|
||||
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft']
|
||||
if property_info['basic']['sqft'] is not None and
|
||||
property_info['basic']['price'] is not None
|
||||
else None,
|
||||
price=property_info['basic']['price'],
|
||||
mls_id=property_id,
|
||||
listing_type=self.listing_type,
|
||||
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None,
|
||||
)]
|
||||
return [
|
||||
Property(
|
||||
mls_id=property_id,
|
||||
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
|
||||
address=self._parse_address(
|
||||
property_info, search_type="handle_address"
|
||||
),
|
||||
description=self._parse_description(property_info),
|
||||
)
|
||||
]
|
||||
|
||||
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
|
||||
query = """query Home_search(
|
||||
$city: String,
|
||||
$county: [String],
|
||||
$state_code: String,
|
||||
$postal_code: String
|
||||
$offset: Int,
|
||||
) {
|
||||
home_search(
|
||||
query: {
|
||||
city: $city
|
||||
county: $county
|
||||
postal_code: $postal_code
|
||||
state_code: $state_code
|
||||
status: %s
|
||||
def general_search(
|
||||
self, variables: dict, search_type: str
|
||||
) -> Dict[str, Union[int, list[Property]]]:
|
||||
"""
|
||||
Handles a location area & returns a list of properties
|
||||
"""
|
||||
results_query = """{
|
||||
count
|
||||
total
|
||||
results {
|
||||
property_id
|
||||
list_date
|
||||
status
|
||||
last_sold_price
|
||||
last_sold_date
|
||||
list_price
|
||||
price_per_sqft
|
||||
description {
|
||||
sqft
|
||||
beds
|
||||
baths_full
|
||||
baths_half
|
||||
lot_sqft
|
||||
sold_price
|
||||
year_built
|
||||
garage
|
||||
sold_price
|
||||
type
|
||||
name
|
||||
stories
|
||||
}
|
||||
limit: 200
|
||||
offset: $offset
|
||||
) {
|
||||
count
|
||||
total
|
||||
results {
|
||||
property_id
|
||||
description {
|
||||
baths
|
||||
beds
|
||||
lot_sqft
|
||||
sqft
|
||||
text
|
||||
sold_price
|
||||
stories
|
||||
year_built
|
||||
garage
|
||||
unit_number
|
||||
floor_number
|
||||
}
|
||||
location {
|
||||
address {
|
||||
city
|
||||
country
|
||||
line
|
||||
postal_code
|
||||
state_code
|
||||
state
|
||||
street_direction
|
||||
street_name
|
||||
street_number
|
||||
street_post_direction
|
||||
street_suffix
|
||||
unit
|
||||
source {
|
||||
id
|
||||
listing_id
|
||||
}
|
||||
hoa {
|
||||
fee
|
||||
}
|
||||
location {
|
||||
address {
|
||||
street_number
|
||||
street_name
|
||||
street_suffix
|
||||
unit
|
||||
city
|
||||
state_code
|
||||
postal_code
|
||||
coordinate {
|
||||
lon
|
||||
lat
|
||||
}
|
||||
}
|
||||
list_price
|
||||
price_per_sqft
|
||||
source {
|
||||
id
|
||||
neighborhoods {
|
||||
name
|
||||
}
|
||||
}
|
||||
}
|
||||
}""" % self.listing_type.value
|
||||
}
|
||||
}"""
|
||||
|
||||
date_param = (
|
||||
'sold_date: { min: "$today-%sD" }' % self.last_x_days
|
||||
if self.listing_type == ListingType.SOLD and self.last_x_days
|
||||
else (
|
||||
'list_date: { min: "$today-%sD" }' % self.last_x_days
|
||||
if self.last_x_days
|
||||
else ""
|
||||
)
|
||||
)
|
||||
|
||||
sort_param = (
|
||||
"sort: [{ field: sold_date, direction: desc }]"
|
||||
if self.listing_type == ListingType.SOLD
|
||||
else "sort: [{ field: list_date, direction: desc }]"
|
||||
)
|
||||
|
||||
pending_or_contingent_param = (
|
||||
"or_filters: { contingent: true, pending: true }"
|
||||
if self.pending_or_contingent
|
||||
else ""
|
||||
)
|
||||
|
||||
if search_type == "comps": #: comps search, came from an address
|
||||
query = """query Property_search(
|
||||
$coordinates: [Float]!
|
||||
$radius: String!
|
||||
$offset: Int!,
|
||||
) {
|
||||
property_search(
|
||||
query: {
|
||||
nearby: {
|
||||
coordinates: $coordinates
|
||||
radius: $radius
|
||||
}
|
||||
status: %s
|
||||
%s
|
||||
}
|
||||
%s
|
||||
limit: 200
|
||||
offset: $offset
|
||||
) %s""" % (
|
||||
self.listing_type.value.lower(),
|
||||
date_param,
|
||||
sort_param,
|
||||
results_query,
|
||||
)
|
||||
elif search_type == "area": #: general search, came from a general location
|
||||
query = """query Home_search(
|
||||
$city: String,
|
||||
$county: [String],
|
||||
$state_code: String,
|
||||
$postal_code: String
|
||||
$offset: Int,
|
||||
) {
|
||||
home_search(
|
||||
query: {
|
||||
city: $city
|
||||
county: $county
|
||||
postal_code: $postal_code
|
||||
state_code: $state_code
|
||||
status: %s
|
||||
%s
|
||||
%s
|
||||
}
|
||||
%s
|
||||
limit: 200
|
||||
offset: $offset
|
||||
) %s""" % (
|
||||
self.listing_type.value.lower(),
|
||||
date_param,
|
||||
pending_or_contingent_param,
|
||||
sort_param,
|
||||
results_query,
|
||||
)
|
||||
else: #: general search, came from an address
|
||||
query = (
|
||||
"""query Property_search(
|
||||
$property_id: [ID]!
|
||||
$offset: Int!,
|
||||
) {
|
||||
property_search(
|
||||
query: {
|
||||
property_id: $property_id
|
||||
}
|
||||
limit: 1
|
||||
offset: $offset
|
||||
) %s"""
|
||||
% results_query
|
||||
)
|
||||
|
||||
payload = {
|
||||
'query': query,
|
||||
'variables': variables,
|
||||
"query": query,
|
||||
"variables": variables,
|
||||
}
|
||||
|
||||
response = self.session.post(self.search_url, json=payload)
|
||||
response = self.session.post(self.SEARCH_GQL_URL, json=payload)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
if return_total:
|
||||
return response_json['data']['home_search']['total']
|
||||
search_key = "home_search" if search_type == "area" else "property_search"
|
||||
|
||||
properties: list[Property] = []
|
||||
|
||||
for result in response_json['data']['home_search']['results']:
|
||||
realty_property = Property(
|
||||
address=Address(
|
||||
address_one=result['location']['address']['line'],
|
||||
city=result['location']['address']['city'],
|
||||
state=result['location']['address']['state_code'],
|
||||
zip_code=result['location']['address']['postal_code'],
|
||||
address_two=result['location']['address']['unit'],
|
||||
),
|
||||
site_name=self.site_name,
|
||||
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
|
||||
beds=result['description']['beds'],
|
||||
baths=result['description']['baths'],
|
||||
stories=result['description']['stories'],
|
||||
year_built=result['description']['year_built'],
|
||||
square_feet=result['description']['sqft'],
|
||||
price_per_square_foot=result['price_per_sqft'],
|
||||
price=result['list_price'],
|
||||
mls_id=result['property_id'],
|
||||
listing_type=self.listing_type,
|
||||
lot_size=result['description']['lot_sqft'],
|
||||
if (
|
||||
response_json is None
|
||||
or "data" not in response_json
|
||||
or response_json["data"] is None
|
||||
or search_key not in response_json["data"]
|
||||
or response_json["data"][search_key] is None
|
||||
or "results" not in response_json["data"][search_key]
|
||||
):
|
||||
return {"total": 0, "properties": []}
|
||||
|
||||
for result in response_json["data"][search_key]["results"]:
|
||||
self.counter += 1
|
||||
mls = (
|
||||
result["source"].get("id")
|
||||
if "source" in result and isinstance(result["source"], dict)
|
||||
else None
|
||||
)
|
||||
|
||||
if not mls and self.mls_only:
|
||||
continue
|
||||
|
||||
able_to_get_lat_long = (
|
||||
result
|
||||
and result.get("location")
|
||||
and result["location"].get("address")
|
||||
and result["location"]["address"].get("coordinate")
|
||||
)
|
||||
|
||||
realty_property = Property(
|
||||
mls=mls,
|
||||
mls_id=result["source"].get("listing_id")
|
||||
if "source" in result and isinstance(result["source"], dict)
|
||||
else None,
|
||||
property_url=f"{self.PROPERTY_URL}{result['property_id']}",
|
||||
status=result["status"].upper(),
|
||||
list_price=result["list_price"],
|
||||
list_date=result["list_date"].split("T")[0]
|
||||
if result.get("list_date")
|
||||
else None,
|
||||
prc_sqft=result.get("price_per_sqft"),
|
||||
last_sold_date=result.get("last_sold_date"),
|
||||
hoa_fee=result["hoa"]["fee"]
|
||||
if result.get("hoa") and isinstance(result["hoa"], dict)
|
||||
else None,
|
||||
latitude=result["location"]["address"]["coordinate"].get("lat")
|
||||
if able_to_get_lat_long
|
||||
else None,
|
||||
longitude=result["location"]["address"]["coordinate"].get("lon")
|
||||
if able_to_get_lat_long
|
||||
else None,
|
||||
address=self._parse_address(result, search_type="general_search"),
|
||||
#: neighborhoods=self._parse_neighborhoods(result),
|
||||
description=self._parse_description(result),
|
||||
)
|
||||
properties.append(realty_property)
|
||||
|
||||
return properties
|
||||
return {
|
||||
"total": response_json["data"][search_key]["total"],
|
||||
"properties": properties,
|
||||
}
|
||||
|
||||
def search(self):
|
||||
location_info = self.handle_location()
|
||||
location_type = location_info["area_type"]
|
||||
|
||||
if location_type == 'address':
|
||||
property_id = location_info['mpr_id']
|
||||
return self.handle_address(property_id)
|
||||
|
||||
offset = 0
|
||||
search_variables = {
|
||||
'city': location_info.get('city'),
|
||||
'county': location_info.get('county'),
|
||||
'state_code': location_info.get('state_code'),
|
||||
'postal_code': location_info.get('postal_code'),
|
||||
'offset': offset,
|
||||
"offset": 0,
|
||||
}
|
||||
|
||||
total = self.handle_area(search_variables, return_total=True)
|
||||
search_type = (
|
||||
"comps"
|
||||
if self.radius and location_type == "address"
|
||||
else "address"
|
||||
if location_type == "address" and not self.radius
|
||||
else "area"
|
||||
)
|
||||
if location_type == "address":
|
||||
if not self.radius: #: single address search, non comps
|
||||
property_id = location_info["mpr_id"]
|
||||
search_variables |= {"property_id": property_id}
|
||||
|
||||
gql_results = self.general_search(
|
||||
search_variables, search_type=search_type
|
||||
)
|
||||
if gql_results["total"] == 0:
|
||||
listing_id = self.get_latest_listing_id(property_id)
|
||||
if listing_id is None:
|
||||
return self.handle_address(property_id)
|
||||
else:
|
||||
return self.handle_listing(listing_id)
|
||||
else:
|
||||
return gql_results["properties"]
|
||||
|
||||
else: #: general search, comps (radius)
|
||||
coordinates = list(location_info["centroid"].values())
|
||||
search_variables |= {
|
||||
"coordinates": coordinates,
|
||||
"radius": "{}mi".format(self.radius),
|
||||
}
|
||||
|
||||
else: #: general search, location
|
||||
search_variables |= {
|
||||
"city": location_info.get("city"),
|
||||
"county": location_info.get("county"),
|
||||
"state_code": location_info.get("state_code"),
|
||||
"postal_code": location_info.get("postal_code"),
|
||||
}
|
||||
|
||||
result = self.general_search(search_variables, search_type=search_type)
|
||||
total = result["total"]
|
||||
homes = result["properties"]
|
||||
|
||||
homes = []
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
futures = [
|
||||
executor.submit(
|
||||
self.handle_area, variables=search_variables | {'offset': i}, return_total=False
|
||||
) for i in range(0, total, 200)
|
||||
self.general_search,
|
||||
variables=search_variables | {"offset": i},
|
||||
search_type=search_type,
|
||||
)
|
||||
for i in range(200, min(total, 10000), 200)
|
||||
]
|
||||
|
||||
for future in as_completed(futures):
|
||||
homes.extend(future.result())
|
||||
homes.extend(future.result()["properties"])
|
||||
|
||||
return homes
|
||||
|
||||
@staticmethod
|
||||
def _parse_neighborhoods(result: dict) -> Optional[str]:
|
||||
neighborhoods_list = []
|
||||
neighborhoods = result["location"].get("neighborhoods", [])
|
||||
|
||||
if neighborhoods:
|
||||
for neighborhood in neighborhoods:
|
||||
name = neighborhood.get("name")
|
||||
if name:
|
||||
neighborhoods_list.append(name)
|
||||
|
||||
return ", ".join(neighborhoods_list) if neighborhoods_list else None
|
||||
|
||||
@staticmethod
def _parse_address(result: dict, search_type):
    """
    Build an ``Address`` from a GraphQL result.

    :param result: the result dict; for ``search_type == "general_search"``
        the address is read from ``result["location"]["address"]``, for any
        other search type from ``result["address"]``
    :param search_type: name of the search that produced ``result``
    :return: an ``Address`` whose street is the street number, name and
        suffix joined by single spaces
    """
    if search_type == "general_search":
        address = result["location"]["address"]
    else:
        address = result["address"]

    # Skip missing components so the street never contains the literal
    # text "None" (which the previous f-string produced for null fields).
    street_parts = (
        address["street_number"],
        address["street_name"],
        address["street_suffix"],
    )
    street = " ".join(str(part) for part in street_parts if part)

    return Address(
        street=street,
        unit=address["unit"],
        city=address["city"],
        state=address["state_code"],
        zip=address["postal_code"],
    )
|
||||
|
||||
@staticmethod
def _parse_description(result: dict) -> Description:
    """
    Build a ``Description`` from the ``description`` entry of a result.

    A missing, null or non-dict ``description`` prints a warning and is
    treated as empty, so every field of the returned object is None except
    ``style``, which is then the empty string.

    :param result: result dict that may carry a ``description`` mapping
    :return: ``Description`` with ``style`` upper-cased (None stays None)
        and the remaining fields copied over as-is
    """
    description_data = result.get("description", {})

    # isinstance() is False for None as well, so one check covers both cases.
    if not isinstance(description_data, dict):
        print("Warning: description_data is invalid!")
        description_data = {}

    style = description_data.get("type", "")
    copied_fields = (
        "beds",
        "baths_full",
        "baths_half",
        "sqft",
        "lot_sqft",
        "sold_price",
        "year_built",
        "garage",
        "stories",
    )

    return Description(
        style=style.upper() if style is not None else None,
        **{field: description_data.get(field) for field in copied_fields},
    )
|
||||
|
||||
@@ -1,158 +0,0 @@
|
||||
import json
|
||||
from ..models import Property, Address, PropertyType, Building
|
||||
from .. import Scraper
|
||||
from typing import Any
|
||||
|
||||
|
||||
class RedfinScraper(Scraper):
    """Scraper that reads listings from redfin.com's ``stingray`` endpoints."""

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        self.listing_type = scraper_input.listing_type

    @staticmethod
    def _load_json(response) -> dict:
        """Decode a response body after removing the ``{}&&`` marker from it."""
        return json.loads(response.text.replace("{}&&", ""))

    def _handle_location(self):
        """
        Resolve ``self.location`` through the location-autocomplete endpoint.

        The exact match is used when the payload has one; otherwise the first
        row of the first section is used.

        :return: ``(id, region_type)`` where ``id`` is the second
            underscore-separated segment of the matched row's id and
            ``region_type`` is "2" (zip), "6" (city), "address", or None for
            any other match type
        """
        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
            self.location
        )

        response_json = self._load_json(self.session.get(url))

        #: match type -> region type; "address" needs to be handled differently
        region_types = {
            "4": "2",  #: zip
            "2": "6",  #: city
            "1": "address",  #: address
        }

        payload = response_json["payload"]
        if payload["exactMatch"] is not None:
            target = payload["exactMatch"]
        else:
            target = payload["sections"][0]["rows"][0]

        return target["id"].split("_")[1], region_types.get(target["type"])

    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
        """
        Convert one home dict into a ``Property``.

        :param home: home entry from a search payload, or the
            ``addressSectionInfo`` section when ``single_search`` is True
        :param single_search: True when ``home`` comes from the single-home
            details endpoint, whose street address and year built are stored
            differently
        """

        def get_value(key: str) -> Any | None:
            # Only ``{"value": ...}`` wrappers are unwrapped; a plain string or
            # number under ``key`` yields None instead of a TypeError or an
            # accidental substring match.
            entry = home.get(key)
            if isinstance(entry, dict) and "value" in entry:
                return entry["value"]
            return None

        if single_search:
            address_one = home["streetAddress"]["assembledAddress"]
        else:
            address_one = get_value("streetLine")

        address = Address(
            address_one=address_one,
            city=home["city"],
            state=home["state"],
            zip_code=home["zip"],
        )

        url = "https://www.redfin.com{}".format(home["url"])

        # lotSize is either a bare int or a {"value": ...} wrapper.
        lot_size_data = home.get("lotSize")
        if isinstance(lot_size_data, int):
            lot_size = lot_size_data
        elif isinstance(lot_size_data, dict):
            lot_size = lot_size_data.get("value", None)
        else:
            lot_size = None

        return Property(
            site_name=self.site_name,
            listing_type=self.listing_type,
            address=address,
            url=url,
            beds=home.get("beds"),
            baths=home.get("baths"),
            stories=home.get("stories"),
            agent_name=get_value("listingAgent"),
            description=home.get("listingRemarks"),
            year_built=home["yearBuilt"] if single_search else get_value("yearBuilt"),
            square_feet=get_value("sqFt"),
            lot_size=lot_size,
            property_type=PropertyType.from_int_code(home.get("propertyType")),
            price_per_square_foot=get_value("pricePerSqFt"),
            price=get_value("price"),
            mls_id=get_value("mlsId"),
        )

    def _parse_building(self, building: dict) -> Building:
        """Convert one building dict from a search payload into a ``Building``."""
        building_address = building["address"]

        def join_parts(*keys: str) -> str:
            # Empty/None components are skipped; joining them directly raised
            # TypeError for None and left stray spaces for empty strings.
            return " ".join(
                str(building_address[key]) for key in keys if building_address[key]
            )

        return Building(
            address=Address(
                address_one=join_parts(
                    "streetNumber",
                    "directionalPrefix",
                    "streetName",
                    "streetType",
                ),
                city=building_address["city"],
                state=building_address["stateOrProvinceCode"],
                zip_code=building_address["postalCode"],
                address_two=join_parts("unitType", "unitValue"),
            ),
            site_name=self.site_name,
            url="https://www.redfin.com{}".format(building["url"]),
            listing_type=self.listing_type,
            num_units=building["numUnitsForSale"],
        )

    def handle_address(self, home_id: str):
        """
        Fetch a single home by id and return it as a one-element list.

        EPs:
        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
        """
        url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
            home_id
        )

        response_json = self._load_json(self.session.get(url))

        parsed_home = self._parse_home(
            response_json["payload"]["addressSectionInfo"], single_search=True
        )
        return [parsed_home]

    def search(self):
        """
        Run the search for ``self.location``.

        An address match is delegated to :meth:`handle_address`; any other
        match queries the ``gis`` endpoint and returns its homes followed by
        its buildings.
        """
        region_id, region_type = self._handle_location()

        if region_type == "address":
            return self.handle_address(region_id)

        # The parameter separators must be literal "&region_..."; an HTML
        # entity decode had turned "&reg" into the registered-trademark sign.
        url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
            region_id, region_type
        )

        payload = self._load_json(self.session.get(url))["payload"]

        homes = [self._parse_home(home) for home in payload["homes"]]
        buildings = [
            self._parse_building(building)
            for building in payload["buildings"].values()
        ]
        return homes + buildings
|
||||
@@ -1,210 +0,0 @@
|
||||
import re
|
||||
import json
|
||||
from ..models import Property, Address, Building, ListingType, PropertyType
|
||||
from ....exceptions import NoResultsFound, PropertyNotFound
|
||||
from .. import Scraper
|
||||
|
||||
|
||||
class ZillowScraper(Scraper):
    """Scraper that reads listings from the JSON embedded in zillow.com pages."""

    def __init__(self, scraper_input):
        super().__init__(scraper_input)
        self.listing_type = scraper_input.listing_type
        # NOTE(review): only FOR_SALE and FOR_RENT set ``self.url``; any other
        # listing type leaves it unset, so search() would fail with
        # AttributeError. Confirm which URL such types should use.
        if self.listing_type == ListingType.FOR_SALE:
            self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
        elif self.listing_type == ListingType.FOR_RENT:
            self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/"

    def search(self):
        """
        Fetch ``self.url`` and parse the ``__NEXT_DATA__`` JSON embedded in it.

        :return: a list of parsed results for a search page, or a one-element
            list for a single-property page
        :raises NoResultsFound: when the page has no ``__NEXT_DATA__`` script
        :raises PropertyNotFound: when the JSON has neither search results nor
            single-property data
        """
        resp = self.session.get(self.url, headers=self._get_headers())
        resp.raise_for_status()
        content = resp.text

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            content,
            re.DOTALL,
        )
        if not match:
            raise NoResultsFound(
                "No results were found for Zillow with the given Location."
            )

        data = json.loads(match.group(1))
        page_props = data["props"]["pageProps"]

        if "searchPageState" in page_props:
            houses = page_props["searchPageState"]["cat1"]["searchResults"][
                "listResults"
            ]
            return [self._parse_home(house) for house in houses]
        elif "gdpClientCache" in page_props:
            # The cache is itself a JSON string; its first key holds the page's
            # property record.
            gdp_client_cache = json.loads(page_props["gdpClientCache"])
            main_key = list(gdp_client_cache.keys())[0]

            property_data = gdp_client_cache[main_key]["property"]
            return [self._get_single_property_page(property_data)]
        raise PropertyNotFound("Specific property data not found in the response.")

    def _parse_home(self, home: dict):
        """
        This method is used when a user enters a generic location & zillow returns more than one property

        :return: a ``Property`` when ``home`` carries ``hdpData.homeInfo``,
            otherwise a ``Building`` built from its address and units
        """
        url = (
            f"https://www.zillow.com{home['detailUrl']}"
            if "zillow.com" not in home["detailUrl"]
            else home["detailUrl"]
        )

        if "hdpData" in home and "homeInfo" in home["hdpData"]:
            home_info = home["hdpData"]["homeInfo"]
            price_data = self._extract_price(home)
            address = self._extract_address(home)
            agent_name = self._extract_agent_name(home)

            return Property(
                site_name=self.site_name,
                address=address,
                agent_name=agent_name,
                url=url,
                beds=home_info["bedrooms"],
                baths=home_info["bathrooms"],
                listing_type=self.listing_type,
                property_type=PropertyType(home_info.get("homeType")),
                **price_data,
            )
        else:
            keys = ("addressStreet", "addressCity", "addressState", "addressZipcode")
            address_one, city, state, zip_code = (home[key] for key in keys)
            address_one, address_two = self._parse_address_two(address_one)
            address = Address(address_one, city, state, zip_code, address_two)

            building_info = self._extract_building_info(home)
            return Building(
                site_name=self.site_name, address=address, url=url, **building_info
            )

    def _get_single_property_page(self, property_data: dict):
        """
        This method is used when a user enters the exact location & zillow returns just one property
        """
        url = (
            f"https://www.zillow.com{property_data['hdpUrl']}"
            if "zillow.com" not in property_data["hdpUrl"]
            else property_data["hdpUrl"]
        )
        address_data = property_data["address"]
        address_one, address_two = self._parse_address_two(
            address_data["streetAddress"]
        )
        address = Address(
            address_one=address_one,
            address_two=address_two,
            city=address_data["city"],
            state=address_data["state"],
            zip_code=address_data["zipcode"],
        )

        # ``or {}`` also covers keys that are present with a null value, which
        # a ``.get(key, {})`` default does not.
        attribution = property_data.get("attributionInfo") or {}
        reso_facts = property_data.get("resoFacts") or {}

        return Property(
            site_name=self.site_name,
            address=address,
            url=url,
            beds=property_data.get("bedrooms", None),
            baths=property_data.get("bathrooms", None),
            year_built=property_data.get("yearBuilt", None),
            price=property_data.get("price", None),
            lot_size=property_data.get("lotSize", None),
            agent_name=attribution.get("agentName", None),
            stories=reso_facts.get("stories", None),
            description=property_data.get("description", None),
            mls_id=attribution.get("mlsId", None),
            price_per_square_foot=reso_facts.get("pricePerSquareFoot", None),
            square_feet=property_data.get("livingArea", None),
            property_type=PropertyType(property_data.get("homeType", None)),
            listing_type=self.listing_type,
        )

    def _extract_building_info(self, home: dict) -> dict:
        """
        Summarise the units of a building result.

        :return: keyword arguments for ``Building``: listing type, unit count
            and the min / max / floor-average unit price; the three price
            entries are None when the building lists no units
        """
        # Unit prices look like "$1,234+"; keep only the leading integer part.
        prices = [
            int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
            for unit in home["units"]
        ]
        return {
            "listing_type": self.listing_type,
            "num_units": len(prices),
            # default=None: min()/max() on an empty sequence would raise.
            "min_unit_price": min(prices, default=None),
            "max_unit_price": max(prices, default=None),
            "avg_unit_price": sum(prices) // len(prices) if prices else None,
        }

    @staticmethod
    def _extract_price(home: dict) -> dict:
        """Return price, square footage, lot size and price per sqft of a home."""
        home_info = home["hdpData"]["homeInfo"]
        price = int(home_info["priceForHDP"])
        square_feet = home_info.get("livingArea")

        return {
            "price": price,
            "square_feet": square_feet,
            "lot_size": home_info.get("lotAreaValue"),
            "price_per_square_foot": (
                price // square_feet if square_feet and price else None
            ),
        }

    @staticmethod
    def _extract_agent_name(home: dict) -> str | None:
        """Return the text after "Listing by: " in ``brokerName``, if present."""
        broker_str = home.get("brokerName", "")
        match = re.search(r"Listing by: (.+)", broker_str)
        return match.group(1) if match else None

    @staticmethod
    def _parse_address_two(address_one: str):
        """
        Split a trailing unit designator off a street address.

        :param address_one: full street line, e.g. "12 Oak Ave #3B"
        :return: ``(street, unit)``; ``unit`` is None when the line has no
            "APT ..." or "#..." suffix
        """
        # \b and the negative lookahead keep "APT" from matching inside words
        # such as "Chapter" or "Aptos", which the unanchored pattern split.
        apt_match = re.search(
            r"(\bAPT(?![A-Za-z])\s*.+|#[\s\S]+)$", address_one, re.I
        )
        if not apt_match:
            return address_one, None

        # The match always runs to the end of the string, so everything
        # before it is the street part.
        return address_one[: apt_match.start()].strip(), apt_match.group().strip()

    @staticmethod
    def _extract_address(home: dict) -> Address:
        """Build an ``Address`` from the ``hdpData.homeInfo`` section of a home."""
        keys = ("streetAddress", "city", "state", "zipcode")
        address_one, city, state, zip_code = (
            home["hdpData"]["homeInfo"][key] for key in keys
        )
        address_one, address_two = ZillowScraper._parse_address_two(address_one)
        return Address(address_one, city, state, zip_code, address_two=address_two)

    @staticmethod
    def _get_headers():
        """Return the fixed set of request headers sent with the page request."""
        return {
            "authority": "parser-external.geo.moveaws.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "origin": "https://www.zillow.com",
            "referer": "https://www.zillow.com/",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
|
||||
@@ -1,14 +1,6 @@
|
||||
class InvalidSite(Exception):
    """Raised when a provided site does not exist."""
|
||||
|
||||
|
||||
class InvalidListingType(Exception):
    """Raised when a provided listing type does not exist."""
|
||||
|
||||
|
||||
class NoResultsFound(Exception):
    """Raised when no results are found for the given location."""
|
||||
|
||||
|
||||
class PropertyNotFound(Exception):
    """Raised when no property is found for the given address."""
|
||||
|
||||
71
homeharvest/utils.py
Normal file
71
homeharvest/utils.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from .core.scrapers.models import Property, ListingType
|
||||
import pandas as pd
|
||||
from .exceptions import InvalidListingType
|
||||
|
||||
# Output column names, in the order they appear in the result DataFrame.
ordered_properties = [
    "property_url",
    "mls",
    "mls_id",
    "status",
    "style",
    "street",
    "unit",
    "city",
    "state",
    "zip_code",
    "beds",
    "full_baths",
    "half_baths",
    "sqft",
    "year_built",
    "list_price",
    "list_date",
    "sold_price",
    "last_sold_date",
    "lot_sqft",
    "price_per_sqft",
    "latitude",
    "longitude",
    "stories",
    "hoa_fee",
    "parking_garage",
]
|
||||
|
||||
|
||||
def process_result(result: Property) -> pd.DataFrame:
|
||||
prop_data = {prop: None for prop in ordered_properties}
|
||||
prop_data.update(result.__dict__)
|
||||
|
||||
if "address" in prop_data:
|
||||
address_data = prop_data["address"]
|
||||
prop_data["street"] = address_data.street
|
||||
prop_data["unit"] = address_data.unit
|
||||
prop_data["city"] = address_data.city
|
||||
prop_data["state"] = address_data.state
|
||||
prop_data["zip_code"] = address_data.zip
|
||||
|
||||
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
|
||||
|
||||
description = result.description
|
||||
prop_data["style"] = description.style
|
||||
prop_data["beds"] = description.beds
|
||||
prop_data["full_baths"] = description.baths_full
|
||||
prop_data["half_baths"] = description.baths_half
|
||||
prop_data["sqft"] = description.sqft
|
||||
prop_data["lot_sqft"] = description.lot_sqft
|
||||
prop_data["sold_price"] = description.sold_price
|
||||
prop_data["year_built"] = description.year_built
|
||||
prop_data["parking_garage"] = description.garage
|
||||
prop_data["stories"] = description.stories
|
||||
|
||||
properties_df = pd.DataFrame([prop_data])
|
||||
properties_df = properties_df.reindex(columns=ordered_properties)
|
||||
|
||||
return properties_df[ordered_properties]
|
||||
|
||||
|
||||
def validate_input(listing_type: str) -> None:
    """Check that ``listing_type`` names a member of ``ListingType``.

    The comparison is made against the upper-cased value, so member names
    are matched case-insensitively.

    Args:
        listing_type: Listing type name supplied by the caller.

    Raises:
        InvalidListingType: If the upper-cased name is not a key of
            ``ListingType.__members__``.
    """
    member_name = listing_type.upper()
    if member_name in ListingType.__members__:
        return

    raise InvalidListingType(
        f"Provided listing type, '{listing_type}', does not exist."
    )
|
||||
38
poetry.lock
generated
38
poetry.lock
generated
@@ -106,6 +106,17 @@ files = [
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "1.1.0"
|
||||
description = "An implementation of lxml.xmlfile for the standard library"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
|
||||
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.1.3"
|
||||
@@ -217,6 +228,20 @@ files = [
|
||||
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.2"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
|
||||
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "23.1"
|
||||
@@ -383,6 +408,17 @@ files = [
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tls-client"
|
||||
version = "0.2.2"
|
||||
description = "Advanced Python HTTP Client."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "tls_client-0.2.2-py3-none-any.whl", hash = "sha256:30934871397cdad6862e00b5634f382666314a452ddd3d774e18323a0ad9b765"},
|
||||
{file = "tls_client-0.2.2.tar.gz", hash = "sha256:78bc0e291e3aadc6c5e903b62bb26c01374577691f2a9e5e17899900a5927a13"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tomli"
|
||||
version = "2.0.1"
|
||||
@@ -425,4 +461,4 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "eede625d6d45085e143b0af246cb2ce00cff8579c667be3b63387c8594a5570d"
|
||||
content-hash = "9b77e1a09fcf2cf5e7e6be53f304cd21a6a51ea51680d661a178afe5e5343670"
|
||||
|
||||
@@ -1,15 +1,20 @@
|
||||
[tool.poetry]
|
||||
name = "homeharvest"
|
||||
version = "0.1.3"
|
||||
description = "Real estate scraping library"
|
||||
version = "0.3.1"
|
||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
homeharvest = "homeharvest.cli:main"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
requests = "^2.31.0"
|
||||
pandas = "^2.1.0"
|
||||
openpyxl = "^3.1.2"
|
||||
tls-client = "^0.2.2"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
@@ -1,12 +1,96 @@
|
||||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import (
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
)
|
||||
|
||||
|
||||
def test_realtor_pending_or_contingent():
    """Toggling ``pending_or_contingent`` must change the number of results."""
    search_area = "Surprise, AZ"

    with_pending = scrape_property(
        location=search_area,
        pending_or_contingent=True,
    )
    without_pending = scrape_property(
        location=search_area,
        pending_or_contingent=False,
    )

    # Both searches must return something before their sizes are compared.
    for outcome in (with_pending, without_pending):
        assert outcome is not None
    assert len(with_pending) != len(without_pending)
|
||||
|
||||
|
||||
def test_realtor_comps():
    """A sold-listing radius search around one address yields a non-empty result."""
    search = {
        "location": "2530 Al Lipscomb Way",
        "radius": 0.5,
        "property_younger_than": 180,
        "listing_type": "sold",
    }
    comps = scrape_property(**search)

    assert comps is not None
    assert len(comps) > 0
|
||||
|
||||
|
||||
def test_realtor_last_x_days_sold():
    """Sold searches over 30 and 10 days must return differently sized results."""
    sold_in_30_days = scrape_property(
        location="Dallas, TX", listing_type="sold", property_younger_than=30
    )
    sold_in_10_days = scrape_property(
        location="Dallas, TX", listing_type="sold", property_younger_than=10
    )

    # Presence is checked first so a missing result fails the assertion
    # before any length is taken.
    both_present = sold_in_30_days is not None and sold_in_10_days is not None
    assert both_present and len(sold_in_30_days) != len(sold_in_10_days)
|
||||
|
||||
|
||||
def test_realtor_single_property():
    """For-sale lookups of single street addresses must each return a result."""
    addresses = (
        "15509 N 172nd Dr, Surprise, AZ 85388",
        "2530 Al Lipscomb Way",
    )
    # Run every lookup before asserting.
    lookups = [
        scrape_property(location=address, listing_type="for_sale")
        for address in addresses
    ]

    assert all(lookup is not None for lookup in lookups)
|
||||
|
||||
|
||||
def test_realtor():
    """Searches by address, city and zip return results; a junk location does not."""
    # NOTE(review): the queries passing ``site_name`` and those passing
    # ``listing_type`` may come from two different revisions of this test -
    # confirm which set is current.
    # The "City, ST" locations do not support the "city, state, USA" format.
    queries = [
        {"location": "2530 Al Lipscomb Way", "site_name": "realtor.com"},
        {"location": "Phoenix, AZ", "site_name": "realtor.com"},
        {"location": "Dallas, TX", "site_name": "realtor.com"},
        {"location": "85281", "site_name": "realtor.com"},
        {"location": "2530 Al Lipscomb Way", "listing_type": "for_sale"},
        {"location": "Phoenix, AZ", "listing_type": "for_rent"},
        {"location": "Dallas, TX", "listing_type": "sold"},
        {"location": "85281"},
    ]
    # Run every query, in order, before asserting.
    outcomes = [scrape_property(**query) for query in queries]

    assert all(outcome is not None for outcome in outcomes)

    # A nonsense location must either raise one of the library's errors or
    # come back as None.
    rejected = []
    try:
        junk_outcome = scrape_property(
            location="abceefg ju098ot498hh9",
            listing_type="for_sale",
        )
    except (InvalidListingType, NoResultsFound):
        pass
    else:
        rejected.append(junk_outcome)

    assert all(outcome is None for outcome in rejected)
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
from homeharvest import scrape_property
|
||||
|
||||
|
||||
def test_redfin():
    """Each Redfin search (address, two cities, zip) must return a result."""
    locations = (
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    )
    # Run every search before asserting.
    outcomes = [
        scrape_property(location=place, site_name="redfin") for place in locations
    ]

    assert all(outcome is not None for outcome in outcomes)
|
||||
@@ -1,12 +0,0 @@
|
||||
from homeharvest import scrape_property
|
||||
|
||||
|
||||
def test_zillow():
    """Each Zillow search (address, two cities, zip) must return a result."""
    locations = (
        "2530 Al Lipscomb Way",
        "Phoenix, AZ, USA",
        "Dallas, TX, USA",
        "85281",
    )
    # Run every search before asserting.
    outcomes = [
        scrape_property(location=place, site_name="zillow") for place in locations
    ]

    assert all(outcome is not None for outcome in outcomes)
|
||||
Reference in New Issue
Block a user