- fix limit parameter

- fix specific for_rent apartment listing prices
pull/102/head v0.3.34
Zachary Hampton 2024-08-13 10:44:11 -07:00
parent 3f44744d61
commit 6d14b8df5a
6 changed files with 81 additions and 9 deletions

View File

@ -128,6 +128,8 @@ Property
├── Property Listing Details:
│ ├── days_on_mls
│ ├── list_price
│ ├── list_price_min
│ ├── list_price_max
│ ├── list_date
│ ├── pending_date
│ ├── sold_price

View File

@ -113,6 +113,9 @@ class Property:
address: Address | None = None
list_price: int | None = None
list_price_min: int | None = None
list_price_max: int | None = None
list_date: str | None = None
pending_date: str | None = None
last_sold_date: str | None = None

View File

@ -20,6 +20,7 @@ class RealtorScraper(Scraper):
PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
NUM_PROPERTY_WORKERS = 20
DEFAULT_PAGE_SIZE = 200
def __init__(self, scraper_input):
super().__init__(scraper_input)
@ -76,7 +77,6 @@ class RealtorScraper(Scraper):
baths_half
lot_sqft
sold_price
sold_price
type
price
status
@ -326,6 +326,8 @@ class RealtorScraper(Scraper):
last_sold_price
last_sold_date
list_price
list_price_max
list_price_min
price_per_sqft
flags {
is_contingent
@ -551,6 +553,8 @@ class RealtorScraper(Scraper):
),
status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"],
list_price_min=result["list_price_min"],
list_price_max=result["list_price_max"],
list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"),
@ -571,9 +575,17 @@ class RealtorScraper(Scraper):
)
return realty_property
properties_list = response_json["data"][search_key]["results"]
total_properties = response_json["data"][search_key]["total"]
offset = variables.get("offset", 0)
#: limit the number of properties to be processed
#: example, if your offset is 200, and your limit is 250, return 50
properties_list = properties_list[:self.limit - offset]
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [
executor.submit(process_property, result) for result in response_json["data"][search_key]["results"]
executor.submit(process_property, result) for result in properties_list
]
for future in as_completed(futures):
@ -582,7 +594,7 @@ class RealtorScraper(Scraper):
properties.append(result)
return {
"total": response_json["data"][search_key]["total"],
"total": total_properties,
"properties": properties,
}
@ -654,7 +666,7 @@ class RealtorScraper(Scraper):
variables=search_variables | {"offset": i},
search_type=search_type,
)
for i in range(200, min(total, self.limit), 200)
for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE)
]
for future in as_completed(futures):

View File

@ -24,6 +24,8 @@ ordered_properties = [
"year_built",
"days_on_mls",
"list_price",
"list_price_min",
"list_price_max",
"list_date",
"sold_price",
"last_sold_date",

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.33"
version = "0.3.34"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"

View File

@ -105,8 +105,8 @@ def test_realtor():
location="2530 Al Lipscomb Way",
listing_type="for_sale",
),
scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format
scrape_property(location="Phoenix, AZ", listing_type="for_rent", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="Dallas, TX", listing_type="sold", limit=1000), #: does not support "city, state, USA" format
scrape_property(location="85281"),
]
@ -117,6 +117,7 @@ def test_realtor_city():
results = scrape_property(
location="Atlanta, GA",
listing_type="for_sale",
limit=1000
)
assert results is not None and len(results) > 0
@ -140,7 +141,7 @@ def test_realtor_foreclosed():
def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale")
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale", limit=1000)
assert scraped["agent"].nunique() > 1
@ -182,6 +183,58 @@ def test_style_value_error():
location="Alaska, AK",
listing_type="sold",
extra_property_data=False,
limit=1000,
)
assert results is not None and len(results) > 0
def test_primary_image_error():
results = scrape_property(
location="Spokane, PA",
listing_type="for_rent", # or (for_sale, for_rent, pending)
past_days=360,
radius=3,
extra_property_data=False,
)
assert results is not None and len(results) > 0
def test_limit():
over_limit = 876
extra_params = {"limit": over_limit}
over_results = scrape_property(
location="Waddell, AZ",
listing_type="for_sale",
**extra_params,
)
assert over_results is not None and len(over_results) <= over_limit
under_limit = 1
under_results = scrape_property(
location="Waddell, AZ",
listing_type="for_sale",
limit=under_limit,
)
assert under_results is not None and len(under_results) == under_limit
def test_apartment_list_price():
results = scrape_property(
location="Spokane, WA",
listing_type="for_rent", # or (for_sale, for_rent, pending)
extra_property_data=False,
)
assert results is not None
results = results[results["style"] == "APARTMENT"]
#: get percentage of results with atleast 1 of any column not none, list_price, list_price_min, list_price_max
assert len(results[results[["list_price", "list_price_min", "list_price_max"]].notnull().any(axis=1)]) / len(
results
) > 0.5