[fix] scrape property params

pull/30/head
Cullen Watson 2023-10-03 21:47:15 -05:00
parent 1464b4f7d4
commit bd33c3b5a4
3 changed files with 233 additions and 7 deletions

View File

@ -29,7 +29,6 @@ def _scrape_single_site(
""" """
Helper function to scrape a single site. Helper function to scrape a single site.
""" """
print(status)
_validate_input(site_name, status) _validate_input(site_name, status)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@ -42,7 +41,6 @@ def _scrape_single_site(
site = _scrapers[site_name.lower()](scraper_input) site = _scrapers[site_name.lower()](scraper_input)
results = site.search() results = site.search()
print(f"Found {len(results)} results for {site_name}")
properties_dfs = [process_result(result) for result in results] properties_dfs = [process_result(result) for result in results]
if not properties_dfs: if not properties_dfs:
@ -53,7 +51,7 @@ def _scrape_single_site(
def scrape_property( def scrape_property(
location: str, location: str,
timeframe: str, timeframe: str = None,
site_name: Union[str, list[str]] = None, site_name: Union[str, list[str]] = None,
status: str = "sale", status: str = "sale",
proxy: str = None, proxy: str = None,

View File

@ -18,8 +18,12 @@ class ScraperInput:
timeframe: Optional[str] = None timeframe: Optional[str] = None
def __post_init__(self): def __post_init__(self):
if self.status == "sold" and not self.timeframe:
raise InvalidTimeFrame("Timeframe is required when status is 'sold'")
if self.timeframe and self.timeframe not in VALID_TIMEFRAMES: if self.timeframe and self.timeframe not in VALID_TIMEFRAMES:
raise InvalidTimeFrame(f"Invalid timeframe provided: {self.timeframe}") raise InvalidTimeFrame(f"Invalid timeframe provided: {self.timeframe}")
if self.status and self.status not in VALID_STATUSES: if self.status and self.status not in VALID_STATUSES:
raise InvalidTimeFrame(f"Invalid status provided: {self.status}") raise InvalidTimeFrame(f"Invalid status provided: {self.status}")

View File

@ -258,9 +258,8 @@ class RealtorScraper(Scraper):
self.status, self.status,
f'"$nowUTC-{self.timeframe}"', f'"$nowUTC-{self.timeframe}"',
) )
payload = { payload = {
"query": query, "query": self.get_query(),
"variables": variables, "variables": variables,
} }
response = self.session.post(self.endpoint, json=payload) response = self.session.post(self.endpoint, json=payload)
@ -314,7 +313,6 @@ class RealtorScraper(Scraper):
+ result["property_id"], + result["property_id"],
mls=mls, mls=mls,
mls_id=mls_id, mls_id=mls_id,
# status=(result["source"]["raw"].get("status").upper() if 'source' in result and isinstance(result["source"], dict) and "raw" in result["source"] and isinstance(result["source"]["raw"], dict) else None),
status=result["status"].upper(), status=result["status"].upper(),
style=result["description"]["type"].upper(), style=result["description"]["type"].upper(),
beds=result["description"]["beds"], beds=result["description"]["beds"],
@ -323,7 +321,9 @@ class RealtorScraper(Scraper):
est_sf=result["description"]["sqft"], est_sf=result["description"]["sqft"],
lot_sf=result["description"]["lot_sqft"], lot_sf=result["description"]["lot_sqft"],
list_price=result["list_price"], list_price=result["list_price"],
list_date=result["list_date"].split("T")[0], list_date=result["list_date"].split("T")[0]
if result["list_date"]
else None,
sold_price=result["description"]["sold_price"], sold_price=result["description"]["sold_price"],
prc_sqft=result["price_per_sqft"], prc_sqft=result["price_per_sqft"],
last_sold_date=result["last_sold_date"], last_sold_date=result["last_sold_date"],
@ -363,6 +363,230 @@ class RealtorScraper(Scraper):
"properties": properties, "properties": properties,
} }
def get_query(self):
    """Return the GraphQL ``Home_search`` query text for ``self.status``.

    The same query document is used for every status; the only variation
    is an extra ``sold_date`` lower-bound filter that is added when
    ``self.status == "sold"``. Building both variants from one template
    keeps the requested fields from drifting apart between the branches.

    Reads:
        self.status: interpolated verbatim as the ``status`` filter value.
        self.timeframe: read only when status is ``"sold"``; interpolated
            as ``"$nowUTC-<timeframe>"`` for the ``sold_date.min`` filter.

    Returns:
        str: the query text, ready to be sent as the ``"query"`` entry of
        the request payload. ``$city``, ``$county``, ``$state_code``,
        ``$postal_code`` and ``$offset`` remain GraphQL variables.

    NOTE(review): ``sold_price`` (inside ``description``) and ``source``
    (inside ``results``) are each selected twice, and results are sorted by
    ``sold_date`` for every status. All of that is kept exactly as in the
    original query text - confirm whether it is intentional.
    """
    if self.status == "sold":
        # Only sold searches are restricted by date; the value is quoted
        # because it is sent as a GraphQL string literal.
        sold_date_filter = f'sold_date: {{ min: "$nowUTC-{self.timeframe}" }}'
    else:
        sold_date_filter = ""

    # %-style named placeholders are used (rather than an f-string) so the
    # many literal braces of the GraphQL document need no escaping.
    template = """query Home_search(
    $city: String,
    $county: [String],
    $state_code: String,
    $postal_code: String,
    $offset: Int
) {
    home_search(
        query: {
            city: $city
            county: $county
            postal_code: $postal_code
            state_code: $state_code
            status: %(status)s
            %(sold_date_filter)s
        }
        limit: 200
        offset: $offset
        sort: [
            {
                field: sold_date,
                direction: desc
            }
        ]
    ) {
        count
        total
        results {
            property_id
            list_date
            status
            last_sold_price
            last_sold_date
            hoa {
                fee
            }
            description {
                baths_full
                baths_half
                beds
                lot_sqft
                sqft
                sold_price
                year_built
                garage
                sold_price
                type
                sub_type
                name
                stories
            }
            source {
                raw {
                    area
                    status
                    style
                }
                last_update_date
                contract_date
                id
                listing_id
                name
                type
                listing_href
                community_id
                management_id
                corporation_id
                subdivision_status
                spec_id
                plan_id
                tier_rank
                feed_type
            }
            location {
                address {
                    city
                    country
                    line
                    postal_code
                    state_code
                    state
                    coordinate {
                        lon
                        lat
                    }
                    street_direction
                    street_name
                    street_number
                    street_post_direction
                    street_suffix
                    unit
                }
                neighborhoods {
                    name
                }
            }
            list_price
            price_per_sqft
            style_category_tags {
                exterior}
            source {
                id
            }
        }
    }
}"""
    return template % {
        "status": self.status,
        "sold_date_filter": sold_date_filter,
    }
def search(self): def search(self):
location_info = self.handle_location() location_info = self.handle_location()
location_type = location_info["area_type"] location_type = location_info["area_type"]