Merge pull request #40 from Bunsly/date_range

Add date_to and date_from params
pull/42/head v0.3.8
Cullen Watson 2023-11-03 18:42:13 -05:00 committed by GitHub
commit d3268d8e5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 108 additions and 60 deletions

View File

@ -46,7 +46,11 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property( properties = scrape_property(
location="San Diego, CA", location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28",
# mls_only=True, # only fetch MLS listings # mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address # proxy="http://user:pass@host:port" # use a proxy to change your IP address
) )
@ -57,34 +61,6 @@ properties.to_csv(filename, index=False)
print(properties.head()) print(properties.head())
``` ```
### CLI
```
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
positional arguments:
location Location to scrape (e.g., San Francisco, CA)
options:
-l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending}
Listing type to scrape
-o {excel,csv}, --output {excel,csv}
Output format
-f FILENAME, --filename FILENAME
Name of the output file (without extension)
-p PROXY, --proxy PROXY
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
```
```bash
homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```
## Output ## Output
```plaintext ```plaintext
@ -115,11 +91,45 @@ Optional
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale). ├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│ Example: 30 (fetches properties listed/sold in the last 30 days) │ Example: 30 (fetches properties listed/sold in the last 30 days)
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
│ (use this to get properties in chunks as there's a 10k result limit)
│ Format for both must be "YYYY-MM-DD".
│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
└── proxy (string): In format 'http://user:pass@host:port' └── proxy (string): In format 'http://user:pass@host:port'
``` ```
### CLI
```
usage: homeharvest [-l {for_sale,for_rent,sold,pending}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
positional arguments:
location Location to scrape (e.g., San Francisco, CA)
options:
-l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending}
Listing type to scrape
-o {excel,csv}, --output {excel,csv}
Output format
-f FILENAME, --filename FILENAME
Name of the output file (without extension)
-p PROXY, --proxy PROXY
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
```
```bash
homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```
### Property Schema ### Property Schema
```plaintext ```plaintext
Property Property

View File

@ -1,10 +1,9 @@
import warnings import warnings
import pandas as pd import pandas as pd
from .core.scrapers import ScraperInput from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType from .core.scrapers.models import ListingType
from .exceptions import InvalidListingType, NoResultsFound
def scrape_property( def scrape_property(
@ -14,6 +13,8 @@ def scrape_property(
mls_only: bool = False, mls_only: bool = False,
past_days: int = None, past_days: int = None,
proxy: str = None, proxy: str = None,
date_from: str = None,
date_to: str = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape properties from Realtor.com based on a given location and listing type. Scrape properties from Realtor.com based on a given location and listing type.
@ -22,9 +23,11 @@ def scrape_property(
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs. :param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping :param proxy: Proxy to use for scraping
""" """
validate_input(listing_type) validate_input(listing_type)
validate_dates(date_from, date_to)
scraper_input = ScraperInput( scraper_input = ScraperInput(
location=location, location=location,
@ -33,6 +36,8 @@ def scrape_property(
radius=radius, radius=radius,
mls_only=mls_only, mls_only=mls_only,
last_x_days=past_days, last_x_days=past_days,
date_from=date_from,
date_to=date_to,
) )
site = RealtorScraper(scraper_input) site = RealtorScraper(scraper_input)
@ -40,7 +45,7 @@ def scrape_property(
properties_dfs = [process_result(result) for result in results] properties_dfs = [process_result(result) for result in results]
if not properties_dfs: if not properties_dfs:
raise NoResultsFound("no results found for the query") return pd.DataFrame()
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)

View File

@ -11,6 +11,8 @@ class ScraperInput:
mls_only: bool | None = None mls_only: bool | None = None
proxy: str | None = None proxy: str | None = None
last_x_days: int | None = None last_x_days: int | None = None
date_from: str | None = None
date_to: str | None = None
class Scraper: class Scraper:
@ -36,6 +38,8 @@ class Scraper:
self.radius = scraper_input.radius self.radius = scraper_input.radius
self.last_x_days = scraper_input.last_x_days self.last_x_days = scraper_input.last_x_days
self.mls_only = scraper_input.mls_only self.mls_only = scraper_input.mls_only
self.date_from = scraper_input.date_from
self.date_to = scraper_input.date_to
def search(self) -> list[Property]: def search(self) -> list[Property]:
... ...

View File

@ -9,7 +9,6 @@ from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .. import Scraper from .. import Scraper
from ....exceptions import NoResultsFound
from ..models import Property, Address, ListingType, Description from ..models import Property, Address, ListingType, Description
@ -38,7 +37,7 @@ class RealtorScraper(Scraper):
result = response_json["autocomplete"] result = response_json["autocomplete"]
if not result: if not result:
raise NoResultsFound("No results found for location: " + self.location) return None
return result[0] return result[0]
@ -336,15 +335,17 @@ class RealtorScraper(Scraper):
} }
}""" }"""
date_param = ( date_param = ""
'sold_date: { min: "$today-%sD" }' % self.last_x_days if self.listing_type == ListingType.SOLD:
if self.listing_type == ListingType.SOLD and self.last_x_days if self.date_from and self.date_to:
else ( date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
'list_date: { min: "$today-%sD" }' % self.last_x_days elif self.last_x_days:
if self.last_x_days date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
else "" else:
) if self.date_from and self.date_to:
) date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'
sort_param = ( sort_param = (
"sort: [{ field: sold_date, direction: desc }]" "sort: [{ field: sold_date, direction: desc }]"
@ -509,6 +510,9 @@ class RealtorScraper(Scraper):
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
if not location_info:
return []
location_type = location_info["area_type"] location_type = location_info["area_type"]
search_variables = { search_variables = {

View File

@ -1,6 +1,5 @@
class InvalidListingType(Exception): class InvalidListingType(Exception):
"""Raised when a provided listing type does not exist.""" """Raised when a provided listing type does not exist."""
class InvalidDate(Exception):
class NoResultsFound(Exception): """Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23 """
"""Raised when no results are found for the given location"""

View File

@ -1,6 +1,7 @@
from .core.scrapers.models import Property, ListingType
import pandas as pd import pandas as pd
from .exceptions import InvalidListingType from datetime import datetime
from .core.scrapers.models import Property, ListingType
from .exceptions import InvalidListingType, InvalidDate
ordered_properties = [ ordered_properties = [
"property_url", "property_url",
@ -70,3 +71,18 @@ def validate_input(listing_type: str) -> None:
raise InvalidListingType( raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist." f"Provided listing type, '{listing_type}', does not exist."
) )
def validate_dates(date_from: str | None, date_to: str | None) -> None:
if (date_from is not None and date_to is None) or (date_from is None and date_to is not None):
raise InvalidDate("Both date_from and date_to must be provided.")
if date_from and date_to:
try:
date_from_obj = datetime.strptime(date_from, "%Y-%m-%d")
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")
if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.")
except ValueError as e:
raise InvalidDate(f"Invalid date format or range")

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.7" version = "0.3.8"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -1,7 +1,6 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import ( from homeharvest.exceptions import (
InvalidListingType, InvalidListingType,
NoResultsFound,
) )
@ -85,6 +84,20 @@ def test_realtor_last_x_days_sold():
) and len(days_result_30) != len(days_result_10) ) and len(days_result_30) != len(days_result_10)
def test_realtor_date_range_sold():
    """A wider sold-date window must yield more results than a narrower one inside it."""
    narrow_window = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-05-01",
        date_to="2023-05-28",
    )
    wide_window = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-04-01",
        date_to="2023-06-10",
    )

    # Both queries must return something before their sizes are compared.
    for frame in (narrow_window, wide_window):
        assert frame is not None
    assert len(narrow_window) < len(wide_window)
def test_realtor_single_property(): def test_realtor_single_property():
results = [ results = [
scrape_property( scrape_property(
@ -117,15 +130,12 @@ def test_realtor():
assert all([result is not None for result in results]) assert all([result is not None for result in results])
bad_results = []
try: def test_realtor_bad_address():
bad_results += [ bad_results = scrape_property(
scrape_property(
location="abceefg ju098ot498hh9", location="abceefg ju098ot498hh9",
listing_type="for_sale", listing_type="for_sale",
) )
] if len(bad_results) == 0:
except (InvalidListingType, NoResultsFound):
assert True assert True
assert all([result is None for result in bad_results])