feat: run all 3 sites with one scrape_property() call

pull/2/head
Cullen Watson 2023-09-18 16:18:22 -05:00
parent fe351ab57c
commit c7a4bfd5e4
1 changed files with 50 additions and 15 deletions

View File

@ -1,11 +1,14 @@
import pandas as pd
from typing import Union
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, SiteName from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd
_scrapers = { _scrapers = {
@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
return properties_df return properties_df
def scrape_property( def _scrape_single_site(
location: str, location: str, site_name: str, listing_type: str
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Scrape property from various sites from a given location and listing type. Helper function to scrape a single site.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
""" """
validate_input(site_name, listing_type) validate_input(site_name, listing_type)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@ -122,3 +116,44 @@ def scrape_property(
return pd.DataFrame() return pd.DataFrame()
return pd.concat(properties_dfs, ignore_index=True) return pd.concat(properties_dfs, ignore_index=True)
def scrape_property(
location: str,
site_name: Union[str, list[str]],
listing_type: str = "for_sale",
) -> pd.DataFrame:
"""
Scrape property from various sites from a given location and listing type.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
"""
if site_name is None:
site_name = list(_scrapers.keys())
if not isinstance(site_name, list):
site_name = [site_name]
if len(site_name) == 1:
return _scrape_single_site(location, site_name[0], listing_type)
results = []
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(_scrape_single_site, location, s_name, listing_type): s_name
for s_name in site_name
}
for future in concurrent.futures.as_completed(futures):
result = future.result()
results.append(result)
if not results:
return pd.DataFrame()
final_df = pd.concat(results, ignore_index=True)
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
return final_df