diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index d2f34e0..ff813a7 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -29,7 +29,6 @@ def _scrape_single_site( """ Helper function to scrape a single site. """ - print(status) _validate_input(site_name, status) scraper_input = ScraperInput( @@ -42,7 +41,6 @@ def _scrape_single_site( site = _scrapers[site_name.lower()](scraper_input) results = site.search() - print(f"Found {len(results)} results for {site_name}") properties_dfs = [process_result(result) for result in results] if not properties_dfs: @@ -53,7 +51,7 @@ def _scrape_single_site( def scrape_property( location: str, - timeframe: str, + timeframe: str = None, site_name: Union[str, list[str]] = None, status: str = "sale", proxy: str = None, diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 79f4e93..1eb9fd3 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -18,8 +18,12 @@ class ScraperInput: timeframe: Optional[str] = None def __post_init__(self): + if self.status == "sold" and not self.timeframe: + raise InvalidTimeFrame("Timeframe is required when status is 'sold'") + if self.timeframe and self.timeframe not in VALID_TIMEFRAMES: raise InvalidTimeFrame(f"Invalid timeframe provided: {self.timeframe}") + if self.status and self.status not in VALID_STATUSES: raise InvalidTimeFrame(f"Invalid status provided: {self.status}") diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index d0e3352..b93fa13 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -258,9 +258,8 @@ class RealtorScraper(Scraper): self.status, f'"$nowUTC-{self.timeframe}"', ) - payload = { - "query": query, + "query": self.get_query(), "variables": variables, } response = self.session.post(self.endpoint, json=payload) @@ -314,7 +313,6 @@ class 
def get_query(self):
    """Return the GraphQL ``Home_search`` query text for this search.

    The query declares the variables ``$city``, ``$county``,
    ``$state_code``, ``$postal_code`` and ``$offset``; the caller is
    expected to supply them alongside the returned text. ``self.status``
    is interpolated directly into the ``status`` filter.

    When ``self.status`` is ``"sold"`` an additional
    ``sold_date: { min: "$nowUTC-<timeframe>" }`` filter is emitted, built
    from ``self.timeframe``. For any other status no date filter is sent
    and ``self.timeframe`` is not read.

    Returns:
        str: The complete query document, requesting up to 200 results
        per page starting at ``$offset``.
    """
    # Only sold searches are restricted by date; every other status uses the
    # same query with this fragment left empty.
    # NOTE(review): presumably self.timeframe has already been validated as
    # non-empty for sold searches before this is called -- confirm.
    if self.status == "sold":
        sold_date_filter = f'sold_date: {{ min: "$nowUTC-{self.timeframe}" }}'
    else:
        sold_date_filter = ""

    # A single template keeps the selected fields identical for both kinds
    # of search. %-style formatting is used because GraphQL's literal braces
    # would all need escaping in an f-string or str.format template.
    # NOTE(review): results are sorted by sold_date for every status,
    # including non-sold searches -- confirm that is intended.
    template = """query Home_search(
    $city: String,
    $county: [String],
    $state_code: String,
    $postal_code: String,
    $offset: Int
) {
    home_search(
        query: {
            city: $city
            county: $county
            postal_code: $postal_code
            state_code: $state_code
            status: %(status)s
            %(sold_date_filter)s
        }
        limit: 200
        offset: $offset
        sort: [
            {
                field: sold_date,
                direction: desc
            }
        ]
    ) {
        count
        total
        results {
            property_id
            list_date
            status
            last_sold_price
            last_sold_date
            hoa {
                fee
            }
            description {
                baths_full
                baths_half
                beds
                lot_sqft
                sqft
                sold_price
                year_built
                garage
                type
                sub_type
                name
                stories
            }
            source {
                raw {
                    area
                    status
                    style
                }
                last_update_date
                contract_date
                id
                listing_id
                name
                type
                listing_href
                community_id
                management_id
                corporation_id
                subdivision_status
                spec_id
                plan_id
                tier_rank
                feed_type
            }
            location {
                address {
                    city
                    country
                    line
                    postal_code
                    state_code
                    state
                    coordinate {
                        lon
                        lat
                    }
                    street_direction
                    street_name
                    street_number
                    street_post_direction
                    street_suffix
                    unit
                }
                neighborhoods {
                    name
                }
            }
            list_price
            price_per_sqft
            style_category_tags {
                exterior
            }
        }
    }
}"""

    return template % {
        "status": self.status,
        "sold_date_filter": sold_date_filter,
    }