mirror of
https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 03:54:29 -08:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0d70007000 | ||
|
|
018d3fbac4 | ||
|
|
803fd618e9 | ||
|
|
b23b55ca80 | ||
|
|
3458a08383 |
@@ -21,7 +21,7 @@
|
|||||||
```bash
|
```bash
|
||||||
pip install -U homeharvest
|
pip install -U homeharvest
|
||||||
```
|
```
|
||||||
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
_Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|||||||
@@ -53,7 +53,9 @@ def scrape_property(
|
|||||||
if not properties_dfs:
|
if not properties_dfs:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
properties_dfs = [df for df in properties_dfs if not df.empty]
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore", category=FutureWarning)
|
warnings.simplefilter("ignore", category=FutureWarning)
|
||||||
|
|
||||||
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""})
|
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -36,6 +37,7 @@ class PropertyType(Enum):
|
|||||||
CONDO_TOWNHOME = "CONDO_TOWNHOME"
|
CONDO_TOWNHOME = "CONDO_TOWNHOME"
|
||||||
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
|
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
|
||||||
CONDO = "CONDO"
|
CONDO = "CONDO"
|
||||||
|
CONDOP = "CONDOP"
|
||||||
CONDOS = "CONDOS"
|
CONDOS = "CONDOS"
|
||||||
COOP = "COOP"
|
COOP = "COOP"
|
||||||
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
|
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
|
||||||
@@ -52,6 +54,7 @@ class PropertyType(Enum):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Address:
|
class Address:
|
||||||
|
full_line: str | None = None
|
||||||
street: str | None = None
|
street: str | None = None
|
||||||
unit: str | None = None
|
unit: str | None = None
|
||||||
city: str | None = None
|
city: str | None = None
|
||||||
@@ -121,7 +124,8 @@ class Property:
|
|||||||
neighborhoods: Optional[str] = None
|
neighborhoods: Optional[str] = None
|
||||||
county: Optional[str] = None
|
county: Optional[str] = None
|
||||||
fips_code: Optional[str] = None
|
fips_code: Optional[str] = None
|
||||||
agents: list[Agent] = None
|
agents: list[Agent] | None = None
|
||||||
|
brokers: list[Broker] | None = None
|
||||||
nearby_schools: list[str] = None
|
nearby_schools: list[str] = None
|
||||||
assessed_value: int | None = None
|
assessed_value: int | None = None
|
||||||
estimated_value: int | None = None
|
estimated_value: int | None = None
|
||||||
|
|||||||
@@ -4,13 +4,13 @@ homeharvest.realtor.__init__
|
|||||||
|
|
||||||
This module implements the scraper for realtor.com
|
This module implements the scraper for realtor.com
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, Union, Optional
|
from typing import Dict, Union, Optional
|
||||||
|
|
||||||
from .. import Scraper
|
from .. import Scraper
|
||||||
from ..models import Property, Address, ListingType, Description, PropertyType, Agent
|
from ..models import Property, Address, ListingType, Description, PropertyType, Agent, Broker
|
||||||
|
|
||||||
|
|
||||||
class RealtorScraper(Scraper):
|
class RealtorScraper(Scraper):
|
||||||
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
|
|||||||
listing_id
|
listing_id
|
||||||
}
|
}
|
||||||
address {
|
address {
|
||||||
|
line
|
||||||
street_direction
|
street_direction
|
||||||
street_number
|
street_number
|
||||||
street_name
|
street_name
|
||||||
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
|
|||||||
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
|
||||||
address=self._parse_address(property_info, search_type="handle_listing"),
|
address=self._parse_address(property_info, search_type="handle_listing"),
|
||||||
description=Description(
|
description=Description(
|
||||||
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
|
alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
|
||||||
style=property_info["basic"].get("type", "").upper(),
|
style=property_info["basic"].get("type", "").upper(),
|
||||||
beds=property_info["basic"].get("beds"),
|
beds=property_info["basic"].get("beds"),
|
||||||
baths_full=property_info["basic"].get("baths_full"),
|
baths_full=property_info["basic"].get("baths_full"),
|
||||||
@@ -180,6 +181,7 @@ class RealtorScraper(Scraper):
|
|||||||
),
|
),
|
||||||
days_on_mls=days_on_mls,
|
days_on_mls=days_on_mls,
|
||||||
agents=prop_details.get("agents"),
|
agents=prop_details.get("agents"),
|
||||||
|
brokers=prop_details.get("brokers"),
|
||||||
nearby_schools=prop_details.get("schools"),
|
nearby_schools=prop_details.get("schools"),
|
||||||
assessed_value=prop_details.get("assessed_value"),
|
assessed_value=prop_details.get("assessed_value"),
|
||||||
estimated_value=prop_details.get("estimated_value"),
|
estimated_value=prop_details.get("estimated_value"),
|
||||||
@@ -235,6 +237,7 @@ class RealtorScraper(Scraper):
|
|||||||
stories
|
stories
|
||||||
}
|
}
|
||||||
address {
|
address {
|
||||||
|
line
|
||||||
street_direction
|
street_direction
|
||||||
street_number
|
street_number
|
||||||
street_name
|
street_name
|
||||||
@@ -295,6 +298,7 @@ class RealtorScraper(Scraper):
|
|||||||
address=self._parse_address(property_info, search_type="handle_address"),
|
address=self._parse_address(property_info, search_type="handle_address"),
|
||||||
description=self._parse_description(property_info),
|
description=self._parse_description(property_info),
|
||||||
agents=prop_details.get("agents"),
|
agents=prop_details.get("agents"),
|
||||||
|
brokers=prop_details.get("brokers"),
|
||||||
nearby_schools=prop_details.get("schools"),
|
nearby_schools=prop_details.get("schools"),
|
||||||
assessed_value=prop_details.get("assessed_value"),
|
assessed_value=prop_details.get("assessed_value"),
|
||||||
estimated_value=prop_details.get("estimated_value"),
|
estimated_value=prop_details.get("estimated_value"),
|
||||||
@@ -350,6 +354,7 @@ class RealtorScraper(Scraper):
|
|||||||
street_number
|
street_number
|
||||||
street_name
|
street_name
|
||||||
street_suffix
|
street_suffix
|
||||||
|
line
|
||||||
unit
|
unit
|
||||||
city
|
city
|
||||||
state_code
|
state_code
|
||||||
@@ -553,6 +558,7 @@ class RealtorScraper(Scraper):
|
|||||||
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
|
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
|
||||||
days_on_mls=self.calculate_days_on_mls(result),
|
days_on_mls=self.calculate_days_on_mls(result),
|
||||||
agents=prop_details.get("agents"),
|
agents=prop_details.get("agents"),
|
||||||
|
brokers=prop_details.get("brokers"),
|
||||||
nearby_schools=prop_details.get("schools"),
|
nearby_schools=prop_details.get("schools"),
|
||||||
assessed_value=prop_details.get("assessed_value"),
|
assessed_value=prop_details.get("assessed_value"),
|
||||||
estimated_value=prop_details.get("estimated_value"),
|
estimated_value=prop_details.get("estimated_value"),
|
||||||
@@ -654,6 +660,8 @@ class RealtorScraper(Scraper):
|
|||||||
if not self.extra_property_data:
|
if not self.extra_property_data:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
#: TODO: migrate "advertisers" and "estimates" to general query
|
||||||
|
|
||||||
query = """query GetHome($property_id: ID!) {
|
query = """query GetHome($property_id: ID!) {
|
||||||
home(property_id: $property_id) {
|
home(property_id: $property_id) {
|
||||||
__typename
|
__typename
|
||||||
@@ -665,7 +673,13 @@ class RealtorScraper(Scraper):
|
|||||||
email
|
email
|
||||||
phones { number type ext primary }
|
phones { number type ext primary }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
consumer_advertisers {
|
||||||
|
name
|
||||||
|
phone
|
||||||
|
href
|
||||||
|
type
|
||||||
|
}
|
||||||
|
|
||||||
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
|
nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {
|
||||||
__typename schools { district { __typename id name } }
|
__typename schools { district { __typename id name } }
|
||||||
@@ -700,7 +714,9 @@ class RealtorScraper(Scraper):
|
|||||||
except (KeyError, TypeError, IndexError):
|
except (KeyError, TypeError, IndexError):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
ads = get_key(["data", "home", "advertisers"])
|
agents = get_key(["data", "home", "advertisers"])
|
||||||
|
advertisers = get_key(["data", "home", "consumer_advertisers"])
|
||||||
|
|
||||||
schools = get_key(["data", "home", "nearbySchools", "schools"])
|
schools = get_key(["data", "home", "nearbySchools", "schools"])
|
||||||
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
|
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
|
||||||
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
|
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])
|
||||||
@@ -709,11 +725,18 @@ class RealtorScraper(Scraper):
|
|||||||
name=ad["name"],
|
name=ad["name"],
|
||||||
email=ad["email"],
|
email=ad["email"],
|
||||||
phones=ad["phones"]
|
phones=ad["phones"]
|
||||||
) for ad in ads]
|
) for ad in agents]
|
||||||
|
|
||||||
|
brokers = [Broker(
|
||||||
|
name=ad["name"],
|
||||||
|
phone=ad["phone"],
|
||||||
|
website=ad["href"]
|
||||||
|
) for ad in advertisers if ad.get("type") != "Agent"]
|
||||||
|
|
||||||
schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
|
schools = [school["district"]["name"] for school in schools if school['district'].get('name')]
|
||||||
return {
|
return {
|
||||||
"agents": agents if agents else None,
|
"agents": agents if agents else None,
|
||||||
|
"brokers": brokers if brokers else None,
|
||||||
"schools": schools if schools else None,
|
"schools": schools if schools else None,
|
||||||
"assessed_value": assessed_value if assessed_value else None,
|
"assessed_value": assessed_value if assessed_value else None,
|
||||||
"estimated_value": estimated_value if estimated_value else None,
|
"estimated_value": estimated_value if estimated_value else None,
|
||||||
@@ -747,6 +770,7 @@ class RealtorScraper(Scraper):
|
|||||||
address = result["address"]
|
address = result["address"]
|
||||||
|
|
||||||
return Address(
|
return Address(
|
||||||
|
full_line=address.get("line"),
|
||||||
street=" ".join(
|
street=" ".join(
|
||||||
part for part in [
|
part for part in [
|
||||||
address.get("street_number"),
|
address.get("street_number"),
|
||||||
@@ -781,8 +805,8 @@ class RealtorScraper(Scraper):
|
|||||||
|
|
||||||
return Description(
|
return Description(
|
||||||
primary_photo=primary_photo,
|
primary_photo=primary_photo,
|
||||||
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
|
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
|
||||||
style=PropertyType(style) if style else None,
|
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
|
||||||
beds=description_data.get("beds"),
|
beds=description_data.get("beds"),
|
||||||
baths_full=description_data.get("baths_full"),
|
baths_full=description_data.get("baths_full"),
|
||||||
baths_half=description_data.get("baths_half"),
|
baths_half=description_data.get("baths_half"),
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from .core.scrapers.models import Property, ListingType, Agent
|
from .core.scrapers.models import Property, ListingType, Agent
|
||||||
@@ -10,6 +11,7 @@ ordered_properties = [
|
|||||||
"status",
|
"status",
|
||||||
"text",
|
"text",
|
||||||
"style",
|
"style",
|
||||||
|
"full_street_line",
|
||||||
"street",
|
"street",
|
||||||
"unit",
|
"unit",
|
||||||
"city",
|
"city",
|
||||||
@@ -40,6 +42,9 @@ ordered_properties = [
|
|||||||
"agent",
|
"agent",
|
||||||
"agent_email",
|
"agent_email",
|
||||||
"agent_phones",
|
"agent_phones",
|
||||||
|
"broker",
|
||||||
|
"broker_phone",
|
||||||
|
"broker_website",
|
||||||
"nearby_schools",
|
"nearby_schools",
|
||||||
"primary_photo",
|
"primary_photo",
|
||||||
"alt_photos",
|
"alt_photos",
|
||||||
@@ -52,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
|
|
||||||
if "address" in prop_data:
|
if "address" in prop_data:
|
||||||
address_data = prop_data["address"]
|
address_data = prop_data["address"]
|
||||||
|
prop_data["full_street_line"] = address_data.full_line
|
||||||
prop_data["street"] = address_data.street
|
prop_data["street"] = address_data.street
|
||||||
prop_data["unit"] = address_data.unit
|
prop_data["unit"] = address_data.unit
|
||||||
prop_data["city"] = address_data.city
|
prop_data["city"] = address_data.city
|
||||||
@@ -65,6 +71,13 @@ def process_result(result: Property) -> pd.DataFrame:
|
|||||||
prop_data["agent_email"] = agents[0].email
|
prop_data["agent_email"] = agents[0].email
|
||||||
prop_data["agent_phones"] = agents[0].phones
|
prop_data["agent_phones"] = agents[0].phones
|
||||||
|
|
||||||
|
if "brokers" in prop_data:
|
||||||
|
brokers = prop_data["brokers"]
|
||||||
|
if brokers:
|
||||||
|
prop_data["broker"] = brokers[0].name
|
||||||
|
prop_data["broker_phone"] = brokers[0].phone
|
||||||
|
prop_data["broker_website"] = brokers[0].website
|
||||||
|
|
||||||
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
|
prop_data["price_per_sqft"] = prop_data["prc_sqft"]
|
||||||
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
|
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
|
||||||
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
|
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.3.21"
|
version = "0.3.26"
|
||||||
description = "Real estate scraping library"
|
description = "Real estate scraping library"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/HomeHarvest"
|
homepage = "https://github.com/Bunsly/HomeHarvest"
|
||||||
@@ -10,7 +10,7 @@ readme = "README.md"
|
|||||||
homeharvest = "homeharvest.cli:main"
|
homeharvest = "homeharvest.cli:main"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = ">=3.10,<3.13"
|
python = ">=3.9,<3.13"
|
||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
pandas = "^2.1.1"
|
pandas = "^2.1.1"
|
||||||
|
|
||||||
|
|||||||
@@ -155,4 +155,13 @@ def test_realtor_without_extra_details():
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
assert results[0] != results[1]
|
assert not results[0].equals(results[1])
|
||||||
|
|
||||||
|
|
||||||
|
def test_pr_zip_code():
|
||||||
|
results = scrape_property(
|
||||||
|
location="00741",
|
||||||
|
listing_type="for_sale",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert results is not None and len(results) > 0
|
||||||
|
|||||||
Reference in New Issue
Block a user