Compare commits

..

3 Commits

Author SHA1 Message Date
Zachary Hampton
018d3fbac4 - Python 3.9 support (tested) (could potentially work for lower versions, but I have not validated such) 2024-05-14 19:13:04 -07:00
Zachary Hampton
803fd618e9 - data cleaning & CONDOP bug fixes 2024-05-12 21:12:12 -07:00
Zachary Hampton
b23b55ca80 - full street line (data quality improvement) 2024-05-12 18:49:44 -07:00
8 changed files with 31 additions and 7 deletions

View File

@@ -21,7 +21,7 @@
```bash ```bash
pip install -U homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.9](https://www.python.org/downloads/release/python-390/) required_
## Usage ## Usage

View File

@@ -53,7 +53,9 @@ def scrape_property(
if not properties_dfs: if not properties_dfs:
return pd.DataFrame() return pd.DataFrame()
properties_dfs = [df for df in properties_dfs if not df.empty]
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": "", None: ""}) return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
@@ -36,6 +37,7 @@ class PropertyType(Enum):
CONDO_TOWNHOME = "CONDO_TOWNHOME" CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP" CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO" CONDO = "CONDO"
CONDOP = "CONDOP"
CONDOS = "CONDOS" CONDOS = "CONDOS"
COOP = "COOP" COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX" DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
@@ -52,6 +54,7 @@ class PropertyType(Enum):
@dataclass @dataclass
class Address: class Address:
full_line: str | None = None
street: str | None = None street: str | None = None
unit: str | None = None unit: str | None = None
city: str | None = None city: str | None = None

View File

@@ -4,7 +4,7 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
@@ -52,6 +52,7 @@ class RealtorScraper(Scraper):
listing_id listing_id
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -236,6 +237,7 @@ class RealtorScraper(Scraper):
stories stories
} }
address { address {
line
street_direction street_direction
street_number street_number
street_name street_name
@@ -352,6 +354,7 @@ class RealtorScraper(Scraper):
street_number street_number
street_name street_name
street_suffix street_suffix
line
unit unit
city city
state_code state_code
@@ -657,6 +660,8 @@ class RealtorScraper(Scraper):
if not self.extra_property_data: if not self.extra_property_data:
return {} return {}
#: TODO: migrate "advertisers" and "estimates" to general query
query = """query GetHome($property_id: ID!) { query = """query GetHome($property_id: ID!) {
home(property_id: $property_id) { home(property_id: $property_id) {
__typename __typename
@@ -765,6 +770,7 @@ class RealtorScraper(Scraper):
address = result["address"] address = result["address"]
return Address( return Address(
full_line=address.get("line"),
street=" ".join( street=" ".join(
part for part in [ part for part in [
address.get("street_number"), address.get("street_number"),
@@ -800,7 +806,7 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=PropertyType(style) if style else None, style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),

View File

@@ -1,3 +1,4 @@
from __future__ import annotations
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .core.scrapers.models import Property, ListingType, Agent from .core.scrapers.models import Property, ListingType, Agent
@@ -10,6 +11,7 @@ ordered_properties = [
"status", "status",
"text", "text",
"style", "style",
"full_street_line",
"street", "street",
"unit", "unit",
"city", "city",
@@ -55,6 +57,7 @@ def process_result(result: Property) -> pd.DataFrame:
if "address" in prop_data: if "address" in prop_data:
address_data = prop_data["address"] address_data = prop_data["address"]
prop_data["full_street_line"] = address_data.full_line
prop_data["street"] = address_data.street prop_data["street"] = address_data.street
prop_data["unit"] = address_data.unit prop_data["unit"] = address_data.unit
prop_data["city"] = address_data.city prop_data["city"] = address_data.city

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.22" version = "0.3.25"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -10,7 +10,7 @@ readme = "README.md"
homeharvest = "homeharvest.cli:main" homeharvest = "homeharvest.cli:main"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.10,<3.13" python = ">=3.9,<3.13"
requests = "^2.31.0" requests = "^2.31.0"
pandas = "^2.1.1" pandas = "^2.1.1"

View File

@@ -155,4 +155,13 @@ def test_realtor_without_extra_details():
), ),
] ]
assert results[0] != results[1] assert not results[0].equals(results[1])
def test_pr_zip_code():
    """Check that a for-sale search by ZIP code "00741" returns results.

    NOTE(review): the test name suggests "00741" is a Puerto Rico ZIP
    code -- confirm. The test only passes when the scrape yields a
    non-None result containing at least one row.
    """
    search_kwargs = {
        "location": "00741",
        "listing_type": "for_sale",
    }
    listings = scrape_property(**search_kwargs)

    assert listings is not None and len(listings) > 0