parent
80186ee8c5
commit
62e3321277
|
@ -112,7 +112,9 @@ def _scrape_single_site(
|
||||||
results = site.search()
|
results = site.search()
|
||||||
|
|
||||||
properties_dfs = [process_result(result) for result in results]
|
properties_dfs = [process_result(result) for result in results]
|
||||||
properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty]
|
properties_dfs = [
|
||||||
|
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
|
||||||
|
]
|
||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
@ -147,7 +149,9 @@ def scrape_property(
|
||||||
else:
|
else:
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
futures = {
|
futures = {
|
||||||
executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
|
executor.submit(
|
||||||
|
_scrape_single_site, location, s_name, listing_type
|
||||||
|
): s_name
|
||||||
for s_name in site_name
|
for s_name in site_name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -169,5 +173,7 @@ def scrape_property(
|
||||||
if col not in final_df.columns:
|
if col not in final_df.columns:
|
||||||
final_df[col] = None
|
final_df[col] = None
|
||||||
|
|
||||||
final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first")
|
final_df = final_df.drop_duplicates(
|
||||||
|
subset=["street_address", "city", "unit"], keep="first"
|
||||||
|
)
|
||||||
return final_df
|
return final_df
|
||||||
|
|
|
@ -249,8 +249,20 @@ class RealtorScraper(Scraper):
|
||||||
unit=parse_unit(result["location"]["address"]["unit"]),
|
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||||
country="USA",
|
country="USA",
|
||||||
),
|
),
|
||||||
latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None,
|
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||||
longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None,
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lat" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
|
longitude=result["location"]["address"]["coordinate"]["lon"]
|
||||||
|
if result
|
||||||
|
and result.get("location")
|
||||||
|
and result["location"].get("address")
|
||||||
|
and result["location"]["address"].get("coordinate")
|
||||||
|
and "lon" in result["location"]["address"]["coordinate"]
|
||||||
|
else None,
|
||||||
site_name=self.site_name,
|
site_name=self.site_name,
|
||||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||||
+ result["property_id"],
|
+ result["property_id"],
|
||||||
|
|
|
@ -27,8 +27,10 @@ class RedfinScraper(Scraper):
|
||||||
elif match_type == "1":
|
elif match_type == "1":
|
||||||
return "address" #: address, needs to be handled differently
|
return "address" #: address, needs to be handled differently
|
||||||
|
|
||||||
if "exactMatch" not in response_json['payload']:
|
if "exactMatch" not in response_json["payload"]:
|
||||||
raise NoResultsFound("No results found for location: {}".format(self.location))
|
raise NoResultsFound(
|
||||||
|
"No results found for location: {}".format(self.location)
|
||||||
|
)
|
||||||
|
|
||||||
if response_json["payload"]["exactMatch"] is not None:
|
if response_json["payload"]["exactMatch"] is not None:
|
||||||
target = response_json["payload"]["exactMatch"]
|
target = response_json["payload"]["exactMatch"]
|
||||||
|
@ -98,8 +100,12 @@ class RedfinScraper(Scraper):
|
||||||
price_per_sqft=get_value("pricePerSqFt"),
|
price_per_sqft=get_value("pricePerSqFt"),
|
||||||
price=get_value("price"),
|
price=get_value("price"),
|
||||||
mls_id=get_value("mlsId"),
|
mls_id=get_value("mlsId"),
|
||||||
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
latitude=home["latLong"]["latitude"]
|
||||||
longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None
|
if "latLong" in home and "latitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
|
longitude=home["latLong"]["longitude"]
|
||||||
|
if "latLong" in home and "longitude" in home["latLong"]
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_building(self, building: dict) -> Property:
|
def _parse_building(self, building: dict) -> Property:
|
||||||
|
|
|
@ -1,15 +1,18 @@
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import string
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ....utils import parse_address_two, parse_unit
|
from ....utils import parse_address_two, parse_unit
|
||||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||||
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
from ..models import Property, Address, ListingType, PropertyType
|
||||||
|
|
||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
super().__init__(scraper_input)
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
if not self.is_plausible_location(self.location):
|
||||||
|
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||||
if self.listing_type == ListingType.FOR_SALE:
|
if self.listing_type == ListingType.FOR_SALE:
|
||||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
||||||
elif self.listing_type == ListingType.FOR_RENT:
|
elif self.listing_type == ListingType.FOR_RENT:
|
||||||
|
@ -17,6 +20,18 @@ class ZillowScraper(Scraper):
|
||||||
else:
|
else:
|
||||||
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
||||||
|
|
||||||
|
@staticmethod
def is_plausible_location(location: str) -> bool:
    """Return whether ``location`` passes a simple noise-token heuristic.

    The string is split on whitespace and each token is inspected on its
    own. A token marks the whole location as implausible when, after
    surrounding punctuation is removed, it is longer than six characters
    and contains at least one digit and at least one letter.

    Args:
        location: Free-form location text supplied by the caller.

    Returns:
        ``False`` as soon as one token matches the pattern above,
        ``True`` otherwise (including for an empty or all-whitespace
        string, which yields no tokens).
    """
    for block in location.split():
        # Punctuation attached to a token (e.g. the comma in "Apt12B,")
        # is formatting, not content; measuring it made the verdict depend
        # on whether the caller happened to type a separator.
        core = block.strip(string.punctuation)
        if len(core) <= 6:
            continue
        if any(char.isdigit() for char in core) and any(
            char.isalpha() for char in core
        ):
            return False
    return True
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
resp = self.session.get(self.url, headers=self._get_headers())
|
resp = self.session.get(self.url, headers=self._get_headers())
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_realtor():
|
def test_realtor():
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_redfin():
|
def test_redfin():
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
from homeharvest import scrape_property
|
from homeharvest import scrape_property
|
||||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
from homeharvest.exceptions import (
|
||||||
|
InvalidSite,
|
||||||
|
InvalidListingType,
|
||||||
|
NoResultsFound,
|
||||||
|
GeoCoordsNotFound,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_zillow():
|
def test_zillow():
|
||||||
|
|
Loading…
Reference in New Issue