parent
80186ee8c5
commit
62e3321277
|
@ -112,7 +112,9 @@ def _scrape_single_site(
|
|||
results = site.search()
|
||||
|
||||
properties_dfs = [process_result(result) for result in results]
|
||||
properties_dfs = [df.dropna(axis=1, how='all') for df in properties_dfs if not df.empty]
|
||||
properties_dfs = [
|
||||
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
|
||||
]
|
||||
if not properties_dfs:
|
||||
return pd.DataFrame()
|
||||
|
||||
|
@ -147,7 +149,9 @@ def scrape_property(
|
|||
else:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = {
|
||||
executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
|
||||
executor.submit(
|
||||
_scrape_single_site, location, s_name, listing_type
|
||||
): s_name
|
||||
for s_name in site_name
|
||||
}
|
||||
|
||||
|
@ -169,5 +173,7 @@ def scrape_property(
|
|||
if col not in final_df.columns:
|
||||
final_df[col] = None
|
||||
|
||||
final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first")
|
||||
final_df = final_df.drop_duplicates(
|
||||
subset=["street_address", "city", "unit"], keep="first"
|
||||
)
|
||||
return final_df
|
||||
|
|
|
@ -249,8 +249,20 @@ class RealtorScraper(Scraper):
|
|||
unit=parse_unit(result["location"]["address"]["unit"]),
|
||||
country="USA",
|
||||
),
|
||||
latitude=result["location"]["address"]["coordinate"]["lat"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lat" in result["location"]["address"]["coordinate"] else None,
|
||||
longitude=result["location"]["address"]["coordinate"]["lon"] if result and result.get("location") and result["location"].get("address") and result["location"]["address"].get("coordinate") and "lon" in result["location"]["address"]["coordinate"] else None,
|
||||
latitude=result["location"]["address"]["coordinate"]["lat"]
|
||||
if result
|
||||
and result.get("location")
|
||||
and result["location"].get("address")
|
||||
and result["location"]["address"].get("coordinate")
|
||||
and "lat" in result["location"]["address"]["coordinate"]
|
||||
else None,
|
||||
longitude=result["location"]["address"]["coordinate"]["lon"]
|
||||
if result
|
||||
and result.get("location")
|
||||
and result["location"].get("address")
|
||||
and result["location"]["address"].get("coordinate")
|
||||
and "lon" in result["location"]["address"]["coordinate"]
|
||||
else None,
|
||||
site_name=self.site_name,
|
||||
property_url="https://www.realtor.com/realestateandhomes-detail/"
|
||||
+ result["property_id"],
|
||||
|
|
|
@ -27,8 +27,10 @@ class RedfinScraper(Scraper):
|
|||
elif match_type == "1":
|
||||
return "address" #: address, needs to be handled differently
|
||||
|
||||
if "exactMatch" not in response_json['payload']:
|
||||
raise NoResultsFound("No results found for location: {}".format(self.location))
|
||||
if "exactMatch" not in response_json["payload"]:
|
||||
raise NoResultsFound(
|
||||
"No results found for location: {}".format(self.location)
|
||||
)
|
||||
|
||||
if response_json["payload"]["exactMatch"] is not None:
|
||||
target = response_json["payload"]["exactMatch"]
|
||||
|
@ -98,8 +100,12 @@ class RedfinScraper(Scraper):
|
|||
price_per_sqft=get_value("pricePerSqFt"),
|
||||
price=get_value("price"),
|
||||
mls_id=get_value("mlsId"),
|
||||
latitude=home["latLong"]["latitude"] if "latLong" in home and "latitude" in home["latLong"] else None,
|
||||
longitude = home["latLong"]["longitude"] if "latLong" in home and "longitude" in home["latLong"] else None
|
||||
latitude=home["latLong"]["latitude"]
|
||||
if "latLong" in home and "latitude" in home["latLong"]
|
||||
else None,
|
||||
longitude=home["latLong"]["longitude"]
|
||||
if "latLong" in home and "longitude" in home["latLong"]
|
||||
else None,
|
||||
)
|
||||
|
||||
def _parse_building(self, building: dict) -> Property:
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
import re
|
||||
import json
|
||||
import string
|
||||
from .. import Scraper
|
||||
from ....utils import parse_address_two, parse_unit
|
||||
from ....exceptions import GeoCoordsNotFound, NoResultsFound
|
||||
from ..models import Property, Address, ListingType, PropertyType, SiteName
|
||||
from ..models import Property, Address, ListingType, PropertyType
|
||||
|
||||
|
||||
class ZillowScraper(Scraper):
|
||||
def __init__(self, scraper_input):
|
||||
super().__init__(scraper_input)
|
||||
self.listing_type = scraper_input.listing_type
|
||||
if not self.is_plausible_location(self.location):
|
||||
raise NoResultsFound("Invalid location input: {}".format(self.location))
|
||||
if self.listing_type == ListingType.FOR_SALE:
|
||||
self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/"
|
||||
elif self.listing_type == ListingType.FOR_RENT:
|
||||
|
@ -17,6 +20,18 @@ class ZillowScraper(Scraper):
|
|||
else:
|
||||
self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/"
|
||||
|
||||
@staticmethod
|
||||
def is_plausible_location(location: str) -> bool:
|
||||
blocks = location.split()
|
||||
for block in blocks:
|
||||
if (
|
||||
any(char.isdigit() for char in block)
|
||||
and any(char.isalpha() for char in block)
|
||||
and len(block) > 6
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
def search(self):
|
||||
resp = self.session.get(self.url, headers=self._get_headers())
|
||||
resp.raise_for_status()
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
||||
from homeharvest.exceptions import (
|
||||
InvalidSite,
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
GeoCoordsNotFound,
|
||||
)
|
||||
|
||||
|
||||
def test_realtor():
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
||||
from homeharvest.exceptions import (
|
||||
InvalidSite,
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
GeoCoordsNotFound,
|
||||
)
|
||||
|
||||
|
||||
def test_redfin():
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
from homeharvest import scrape_property
|
||||
from homeharvest.exceptions import InvalidSite, InvalidListingType, NoResultsFound, GeoCoordsNotFound
|
||||
from homeharvest.exceptions import (
|
||||
InvalidSite,
|
||||
InvalidListingType,
|
||||
NoResultsFound,
|
||||
GeoCoordsNotFound,
|
||||
)
|
||||
|
||||
|
||||
def test_zillow():
|
||||
|
|
Loading…
Reference in New Issue