Compare commits

...

5 Commits

Author SHA1 Message Date
Zachary Hampton
aacd168545 - alt photos bug fix 2024-05-18 17:47:55 -07:00
Zachary Hampton
0d70007000 - alt photos bug fix 2024-05-16 23:04:07 -07:00
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
8 changed files with 34 additions and 10 deletions

View File

@@ -21,7 +21,7 @@
```bash
pip install -U homeharvest
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
_Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage

View File

@@ -53,7 +53,9 @@ def scrape_property(
if not properties_dfs:
return pd.DataFrame()
properties_dfs = [df for df in properties_dfs if not df.empty]
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""})
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Optional
@@ -36,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS"
COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +54,7 @@ class PropertyType(Enum):
@dataclass
class Address:
full_line: str | None = None
street: str | None = None
unit: str | None = None
city: str | None = None

View File

@@ -4,7 +4,7 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com
"""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Dict, Union, Optional
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id
}
address {
line
street_direction
street_number
street_name
@@ -165,7 +166,7 @@ class RealtorScraper(Scraper):
longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
address=self._parse_address(property_info, search_type="handle_listing"),
description=Description(
alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
alt_photos=self.process_alt_photos(property_info["media"].get("photos", [])) if property_info.get("media") else None,
style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"),
@@ -236,6 +237,7 @@ class RealtorScraper(Scraper):
stories
}
address {
line
street_direction
street_number
street_name
@@ -352,6 +354,7 @@ class RealtorScraper(Scraper):
street_number
street_name
street_suffix
line
unit
city
state_code
@@ -657,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data:
return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) {
__typename
@@ -765,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"]
return Address(
full_line=address.get("line"),
street=" ".join(
part for part in [
address.get("street_number"),
@@ -799,8 +805,8 @@ class RealtorScraper(Scraper):
return Description(
primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=PropertyType(style) if style else None,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])),
style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd
from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent
@@ -10,6 +11,7 @@ ordered_properties = [
"status",
"text",
"style",
"full_street_line",
"street",
"unit",
"city",
@@ -55,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city
@@ -81,7 +84,7 @@ def process_result(result: Property) -> pd.DataFrame:
description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None
prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.22"
version = "0.3.27"
description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
python = ">=3.9,<3.13"
requests = "^2.31.0"
pandas = "^2.1.1"

View File

@@ -155,4 +155,13 @@ def test_realtor_without_extra_details():
),
]
assert results[0] != results[1]
assert not results[0].equals(results[1])
def test_pr_zip_code():
    """Check that a for-sale search on ZIP code "00741" returns at least one result."""
    listings = scrape_property(location="00741", listing_type="for_sale")

    # Two separate checks: a missing result and an empty result both fail the test.
    assert listings is not None
    assert len(listings) > 0