Compare commits

..

10 Commits

Author SHA1 Message Date
Zachary Hampton
00537329cf - version bump 2023-10-04 21:35:21 -07:00
Zachary Hampton
a9225b532f - rename days variable 2023-10-04 21:35:14 -07:00
Zachary Hampton
ba7ad069c9 Merge pull request #32 from ZacharyHampton/key_error
[fix] keyerror on style
2023-10-04 20:35:05 -07:00
Cullen Watson
22bda972b0 [chore] version number 2023-10-04 22:34:52 -05:00
Cullen Watson
6f5bbf79a4 [fix] keyerror on style 2023-10-04 22:33:21 -05:00
Cullen Watson
608cceba34 [docs] reorder 2023-10-04 22:12:16 -05:00
Cullen Watson
3609586995 [docs]: add contingent to example 2023-10-04 22:11:38 -05:00
Cullen Watson
68c7e411e4 [docs] pending / contingent searches 2023-10-04 22:07:51 -05:00
Cullen Watson
5e825601a7 [docs] update example 2023-10-04 21:50:54 -05:00
Cullen Watson
ce3f94d0af [docs] update example 2023-10-04 21:50:16 -05:00
7 changed files with 106 additions and 54 deletions

View File

@@ -17,8 +17,9 @@ Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/
- **Data Format**: Structures data to resemble MLS listings.
- **Export Flexibility**: Options to save as either CSV or Excel.
- **Usage Modes**:
- **CLI**: For users who prefer command-line operations.
- **Python**: For those who'd like to integrate scraping into their Python scripts.
- **CLI**: For users who prefer command-line operations.
[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_
@@ -33,10 +34,35 @@ pip install homeharvest
## Usage
### Python
```py
from homeharvest import scrape_property
from datetime import datetime
# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
# pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
# mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
)
print(f"Number of properties: {len(properties)}")
# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())
```
### CLI
```
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] location
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
@@ -54,35 +80,16 @@ options:
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
-c, --pending_or_contingent
If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.
```
```bash
> homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```
### Python
```py
from homeharvest import scrape_property
from datetime import datetime
# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output/{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent)
property_younger_than=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
mls_only=True, # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")
# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())
```
## Output
```plaintext
@@ -111,6 +118,8 @@ Optional
├── property_younger_than (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│ Example: 30 (fetches properties listed/sold in the last 30 days)
|
├── pending_or_contingent (True/False): If set, fetches only pending or contingent listings. Only applicable for `for_sale` listings from general area searches.
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)

View File

@@ -3,16 +3,18 @@ from datetime import datetime
# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output/{current_timestamp}.csv"
filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # for_sale, for_rent
property_younger_than=30, # sold/listed in last 30 days
mls_only=True, # only fetch MLS listings
listing_type="sold", # or (for_sale, for_rent)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
# pending_or_contingent=True # use on for_sale listings to find pending / contingent listings
# mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
)
print(f"Number of properties: {len(properties)}")
# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())
print(properties.head())

View File

@@ -12,7 +12,7 @@ def scrape_property(
listing_type: str = "for_sale",
radius: float = None,
mls_only: bool = False,
property_younger_than: int = None,
past_days: int = None,
pending_or_contingent: bool = False,
proxy: str = None,
) -> pd.DataFrame:
@@ -22,7 +22,7 @@ def scrape_property(
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param property_younger_than: Get properties sold/listed in last _ days.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param pending_or_contingent: If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.
:param proxy: Proxy to use for scraping
"""
@@ -34,7 +34,7 @@ def scrape_property(
proxy=proxy,
radius=radius,
mls_only=mls_only,
last_x_days=property_younger_than,
last_x_days=past_days,
pending_or_contingent=pending_or_contingent,
)

View File

@@ -60,6 +60,13 @@ def main():
help="If set, fetches only MLS listings.",
)
parser.add_argument(
"-c",
"--pending_or_contingent",
action="store_true",
help="If set, fetches only pending or contingent listings. Only applicable for for_sale listings from general area searches.",
)
args = parser.parse_args()
result = scrape_property(
@@ -68,7 +75,8 @@ def main():
radius=args.radius,
proxy=args.proxy,
mls_only=args.mls_only,
property_younger_than=args.days,
past_days=args.days,
pending_or_contingent=args.pending_or_contingent,
)
if not args.filename:

View File

@@ -105,10 +105,10 @@ class RealtorScraper(Scraper):
)
able_to_get_lat_long = (
property_info
and property_info.get("address")
and property_info["address"].get("location")
and property_info["address"]["location"].get("coordinate")
property_info
and property_info.get("address")
and property_info["address"].get("location")
and property_info["address"]["location"].get("coordinate")
)
listing = Property(
@@ -122,8 +122,10 @@ class RealtorScraper(Scraper):
list_date=property_info["basic"]["list_date"].split("T")[0]
if property_info["basic"].get("list_date")
else None,
prc_sqft=property_info["basic"].get("price") / property_info["basic"].get("sqft")
if property_info["basic"].get("price") and property_info["basic"].get("sqft")
prc_sqft=property_info["basic"].get("price")
/ property_info["basic"].get("sqft")
if property_info["basic"].get("price")
and property_info["basic"].get("sqft")
else None,
last_sold_date=property_info["basic"]["sold_date"].split("T")[0]
if property_info["basic"].get("sold_date")
@@ -146,7 +148,7 @@ class RealtorScraper(Scraper):
year_built=property_info["details"].get("year_built"),
garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"),
)
),
)
return [listing]
@@ -175,7 +177,10 @@ class RealtorScraper(Scraper):
if property_info["listings"] is None:
return None
primary_listing = next((listing for listing in property_info["listings"] if listing["primary"]), None)
primary_listing = next(
(listing for listing in property_info["listings"] if listing["primary"]),
None,
)
if primary_listing:
return primary_listing["listing_id"]
else:
@@ -328,7 +333,11 @@ class RealtorScraper(Scraper):
else "sort: [{ field: list_date, direction: desc }]"
)
pending_or_contingent_param = "or_filters: { contingent: true, pending: true }" if self.pending_or_contingent else ""
pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }"
if self.pending_or_contingent
else ""
)
if search_type == "comps": #: comps search, came from an address
query = """query Property_search(
@@ -384,7 +393,7 @@ class RealtorScraper(Scraper):
)
else: #: general search, came from an address
query = (
"""query Property_search(
"""query Property_search(
$property_id: [ID]!
$offset: Int!,
) {
@@ -394,7 +403,9 @@ class RealtorScraper(Scraper):
}
limit: 1
offset: $offset
) %s""" % results_query)
) %s"""
% results_query
)
payload = {
"query": query,
@@ -477,13 +488,21 @@ class RealtorScraper(Scraper):
"offset": 0,
}
search_type = "comps" if self.radius and location_type == "address" else "address" if location_type == "address" and not self.radius else "area"
search_type = (
"comps"
if self.radius and location_type == "address"
else "address"
if location_type == "address" and not self.radius
else "area"
)
if location_type == "address":
if not self.radius: #: single address search, non comps
property_id = location_info["mpr_id"]
search_variables |= {"property_id": property_id}
gql_results = self.general_search(search_variables, search_type=search_type)
gql_results = self.general_search(
search_variables, search_type=search_type
)
if gql_results["total"] == 0:
listing_id = self.get_latest_listing_id(property_id)
if listing_id is None:
@@ -561,8 +580,17 @@ class RealtorScraper(Scraper):
@staticmethod
def _parse_description(result: dict) -> Description:
description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict):
print("Warning: description_data is invalid!")
description_data = {}
style = description_data.get("type", "")
if style is not None:
style = style.upper()
return Description(
style=description_data.get("type", "").upper(),
style=style,
beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"),

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.0"
version = "0.3.2"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest"

View File

@@ -16,7 +16,12 @@ def test_realtor_pending_or_contingent():
pending_or_contingent=False,
)
assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
assert all(
[
result is not None
for result in [pending_or_contingent_result, regular_result]
]
)
assert len(pending_or_contingent_result) != len(regular_result)
@@ -24,7 +29,7 @@ def test_realtor_comps():
result = scrape_property(
location="2530 Al Lipscomb Way",
radius=0.5,
property_younger_than=180,
past_days=180,
listing_type="sold",
)
@@ -33,11 +38,11 @@ def test_realtor_comps():
def test_realtor_last_x_days_sold():
days_result_30 = scrape_property(
location="Dallas, TX", listing_type="sold", property_younger_than=30
location="Dallas, TX", listing_type="sold", past_days=30
)
days_result_10 = scrape_property(
location="Dallas, TX", listing_type="sold", property_younger_than=10
location="Dallas, TX", listing_type="sold", past_days=10
)
assert all(