Compare commits

...

12 Commits

Author SHA1 Message Date
Zachary Hampton
c597a78191 - None address bug fix 2023-10-18 16:32:43 -07:00
Zachary Hampton
11a7d854f0 - remove pending listings from for_sale 2023-10-18 14:41:41 -07:00
Zachary Hampton
f726548cc6 Update pyproject.toml 2023-10-18 09:35:48 -07:00
Zachary Hampton
fad7d670eb Update README.md 2023-10-18 08:37:42 -07:00
Zachary Hampton
89a6f93c9f Update pyproject.toml 2023-10-18 08:37:26 -07:00
Zachary Hampton
e1090b06e4 Update README.md 2023-10-17 20:22:25 -07:00
Cullen Watson
5036e74b60 Merge branch 'master' of https://github.com/ZacharyHampton/HomeHarvest 2023-10-09 11:30:17 -05:00
Cullen Watson
2cb544bc8d [chore] display clickable URLs in jupyter 2023-10-09 11:28:56 -05:00
Zachary Hampton
68cb365e03 Merge pull request #34 from ZacharyHampton/days_on_mls
[enh] days_on_mls attr
2023-10-09 09:04:59 -07:00
Cullen Watson
23876d5725 [chore] function types 2023-10-09 11:02:51 -05:00
Cullen Watson
b59d55f6b5 [enh] days_on_mls attr 2023-10-09 11:00:36 -05:00
Cullen Watson
3c3adb5f29 [docs] update video 2023-10-05 20:24:23 -05:00
7 changed files with 121 additions and 42 deletions

View File

@@ -6,9 +6,9 @@
**Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com). **Not technical?** Try out the web scraping tool on our site at [tryhomeharvest.com](https://tryhomeharvest.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to work with us.*
Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** a Python package for job scraping* Check out another project we wrote: ***[JobSpy](https://github.com/Bunsly/JobSpy)** a Python package for job scraping*
## HomeHarvest Features ## HomeHarvest Features
@@ -20,7 +20,7 @@ Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/
- **CLI**: For users who prefer command-line operations. - **CLI**: For users who prefer command-line operations.
[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_ [Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_
![homeharvest](https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/b3d5d727-e67b-4a9f-85d8-1e65fd18620a) ![homeharvest](https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/b3d5d727-e67b-4a9f-85d8-1e65fd18620a)
@@ -147,6 +147,7 @@ Property
│ └── lot_sqft │ └── lot_sqft
├── Property Listing Details: ├── Property Listing Details:
│ ├── days_on_mls
│ ├── list_price │ ├── list_price
│ ├── list_date │ ├── list_date
│ ├── sold_price │ ├── sold_price

View File

@@ -4,7 +4,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "cb48903e-5021-49fe-9688-45cd0bc05d0f", "id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
"metadata": {}, "metadata": {
"is_executing": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from homeharvest import scrape_property\n", "from homeharvest import scrape_property\n",
@@ -84,10 +86,34 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# check sold properties\n", "# check sold properties\n",
"scrape_property(\n", "properties = scrape_property(\n",
" location=\"90210\",\n", " location=\"90210\",\n",
" listing_type=\"sold\"\n", " listing_type=\"sold\",\n",
")" " past_days=10\n",
")\n",
"display(properties)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "628c1ce2",
"metadata": {
"collapsed": false,
"is_executing": true,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"# display clickable URLs\n",
"from IPython.display import display, HTML\n",
"properties['property_url'] = '<a href=\"' + properties['property_url'] + '\" target=\"_blank\">' + properties['property_url'] + '</a>'\n",
"\n",
"html = properties.to_html(escape=False)\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))"
] ]
} }
], ],

View File

@@ -59,6 +59,7 @@ class Property:
last_sold_date: str | None = None last_sold_date: str | None = None
prc_sqft: int | None = None prc_sqft: int | None = None
hoa_fee: int | None = None hoa_fee: int | None = None
days_on_mls: int | None = None
description: Description | None = None description: Description | None = None
latitude: float | None = None latitude: float | None = None

View File

@@ -4,6 +4,7 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -104,11 +105,29 @@ class RealtorScraper(Scraper):
) )
able_to_get_lat_long = ( able_to_get_lat_long = (
property_info property_info
and property_info.get("address") and property_info.get("address")
and property_info["address"].get("location") and property_info["address"].get("location")
and property_info["address"]["location"].get("coordinate") and property_info["address"]["location"].get("coordinate")
) )
list_date_str = property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get(
"list_date") else None
last_sold_date_str = property_info["basic"]["sold_date"].split("T")[0] if property_info["basic"].get(
"sold_date") else None
list_date = datetime.strptime(list_date_str, "%Y-%m-%d") if list_date_str else None
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None
today = datetime.now()
days_on_mls = None
status = property_info["basic"]["status"].lower()
if list_date:
if status == "sold" and last_sold_date:
days_on_mls = (last_sold_date - list_date).days
elif status in ('for_sale', 'for_rent'):
days_on_mls = (today - list_date).days
if days_on_mls and days_on_mls < 0:
days_on_mls = None
listing = Property( listing = Property(
mls=mls, mls=mls,
@@ -118,17 +137,13 @@ class RealtorScraper(Scraper):
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
status=property_info["basic"]["status"].upper(), status=property_info["basic"]["status"].upper(),
list_price=property_info["basic"]["price"], list_price=property_info["basic"]["price"],
list_date=property_info["basic"]["list_date"].split("T")[0] list_date=list_date,
if property_info["basic"].get("list_date")
else None,
prc_sqft=property_info["basic"].get("price") prc_sqft=property_info["basic"].get("price")
/ property_info["basic"].get("sqft") / property_info["basic"].get("sqft")
if property_info["basic"].get("price") if property_info["basic"].get("price")
and property_info["basic"].get("sqft") and property_info["basic"].get("sqft")
else None,
last_sold_date=property_info["basic"]["sold_date"].split("T")[0]
if property_info["basic"].get("sold_date")
else None, else None,
last_sold_date=last_sold_date,
latitude=property_info["address"]["location"]["coordinate"].get("lat") latitude=property_info["address"]["location"]["coordinate"].get("lat")
if able_to_get_lat_long if able_to_get_lat_long
else None, else None,
@@ -148,6 +163,7 @@ class RealtorScraper(Scraper):
garage=property_info["details"].get("garage"), garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"), stories=property_info["details"].get("stories"),
), ),
days_on_mls=days_on_mls
) )
return [listing] return [listing]
@@ -455,6 +471,9 @@ class RealtorScraper(Scraper):
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING:
continue
realty_property = Property( realty_property = Property(
mls=mls, mls=mls,
mls_id=result["source"].get("listing_id") mls_id=result["source"].get("listing_id")
@@ -478,8 +497,8 @@ class RealtorScraper(Scraper):
if able_to_get_lat_long if able_to_get_lat_long
else None, else None,
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
#: neighborhoods=self._parse_neighborhoods(result),
description=self._parse_description(result), description=self._parse_description(result),
days_on_mls=self.calculate_days_on_mls(result)
) )
properties.append(realty_property) properties.append(realty_property)
@@ -568,21 +587,28 @@ class RealtorScraper(Scraper):
return ", ".join(neighborhoods_list) if neighborhoods_list else None return ", ".join(neighborhoods_list) if neighborhoods_list else None
@staticmethod @staticmethod
def _parse_address(result: dict, search_type): def handle_none_safely(address_part):
if address_part is None:
return ""
return address_part
def _parse_address(self, result: dict, search_type):
if search_type == "general_search": if search_type == "general_search":
return Address( address = result['location']['address']
street=f"{result['location']['address']['street_number']} {result['location']['address']['street_name']} {result['location']['address']['street_suffix']}", else:
unit=result["location"]["address"]["unit"], address = result["address"]
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip=result["location"]["address"]["postal_code"],
)
return Address( return Address(
street=f"{result['address']['street_number']} {result['address']['street_name']} {result['address']['street_suffix']}", street=" ".join([
unit=result["address"]["unit"], self.handle_none_safely(address.get('street_number')),
city=result["address"]["city"], self.handle_none_safely(address.get('street_name')),
state=result["address"]["state_code"], self.handle_none_safely(address.get('street_suffix')),
zip=result["address"]["postal_code"], ]).strip(),
unit=address["unit"],
city=address["city"],
state=address["state_code"],
zip=address["postal_code"],
) )
@staticmethod @staticmethod
@@ -590,7 +616,6 @@ class RealtorScraper(Scraper):
description_data = result.get("description", {}) description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict): if description_data is None or not isinstance(description_data, dict):
print("Warning: description_data is invalid!")
description_data = {} description_data = {}
style = description_data.get("type", "") style = description_data.get("type", "")
@@ -609,3 +634,22 @@ class RealtorScraper(Scraper):
garage=description_data.get("garage"), garage=description_data.get("garage"),
stories=description_data.get("stories"), stories=description_data.get("stories"),
) )
@staticmethod
def calculate_days_on_mls(result: dict) -> Optional[int]:
list_date_str = result.get("list_date")
list_date = datetime.strptime(list_date_str.split("T")[0], "%Y-%m-%d") if list_date_str else None
last_sold_date_str = result.get("last_sold_date")
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None
today = datetime.now()
if list_date:
if result["status"] == 'sold':
if last_sold_date:
days = (last_sold_date - list_date).days
if days >= 0:
return days
elif result["status"] in ('for_sale', 'for_rent'):
days = (today - list_date).days
if days >= 0:
return days

View File

@@ -18,6 +18,7 @@ ordered_properties = [
"half_baths", "half_baths",
"sqft", "sqft",
"year_built", "year_built",
"days_on_mls",
"list_price", "list_price",
"list_date", "list_date",
"sold_price", "sold_price",

View File

@@ -1,9 +1,9 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.4" version = "0.3.7"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/ZacharyHampton/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
readme = "README.md" readme = "README.md"
[tool.poetry.scripts] [tool.poetry.scripts]

View File

@@ -7,14 +7,10 @@ from homeharvest.exceptions import (
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
pending_or_contingent_result = scrape_property( pending_or_contingent_result = scrape_property(
location="Surprise, AZ", location="Surprise, AZ", listing_type="pending"
listing_type="pending"
) )
regular_result = scrape_property( regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale")
location="Surprise, AZ",
listing_type="for_sale"
)
assert all( assert all(
[ [
@@ -54,6 +50,16 @@ def test_realtor_pending_comps():
assert len(set([len(result) for result in results])) == len(results) assert len(set([len(result) for result in results])) == len(results)
def test_realtor_sold_past():
result = scrape_property(
location="San Diego, CA",
past_days=30,
listing_type="sold",
)
assert result is not None and len(result) > 0
def test_realtor_comps(): def test_realtor_comps():
result = scrape_property( result = scrape_property(
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",