Compare commits

...

8 Commits

Author SHA1 Message Date
Cullen Watson
7246703999 Schools (#69) 2024-04-16 20:01:20 -05:00
Cullen Watson
6076b0f961 enh: add agent (#68) 2024-04-16 15:09:32 -05:00
Cullen Watson
cdc6f2a2a8 docs: readme 2024-04-16 14:59:50 -05:00
Cullen Watson
0bdf56568e enh: add agent name/phone (#66) 2024-04-16 14:55:44 -05:00
Cullen Watson
1f47fc3b7e fix: use enum value (#65) 2024-04-12 01:41:15 -05:00
Zachary Hampton
5c2498c62b - pending date, property type fields (#45)
- alt photos bug fix (#57)
2024-03-13 19:17:17 -07:00
Zachary Hampton
d775540afd - location bug fix 2024-03-06 16:31:06 -07:00
Cullen Watson
5ea9a6f6b6 docs: readme 2024-03-03 11:49:27 -06:00
13 changed files with 487 additions and 194 deletions

View File

@@ -30,4 +30,4 @@ jobs:
if: startsWith(github.ref, 'refs/tags') if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1 uses: pypa/gh-action-pypi-publish@release/v1
with: with:
password: ${{ secrets.PYPI_API_TOKEN }} password: ${{ secrets.PYPI_API_TOKEN }}

2
.gitignore vendored
View File

@@ -4,4 +4,4 @@
**/.pytest_cache/ **/.pytest_cache/
*.pyc *.pyc
/.ipynb_checkpoints/ /.ipynb_checkpoints/
*.csv *.csv

21
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,21 @@
---
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-added-large-files
- id: check-yaml
- repo: https://github.com/adrienverge/yamllint
rev: v1.29.0
hooks:
- id: yamllint
verbose: true # create awareness of linter findings
args: ["-d", "{extends: relaxed, rules: {line-length: {max: 120}}}"]
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
language_version: python
args: [--line-length=120, --quiet]

View File

@@ -11,8 +11,6 @@
- **Source**: Fetches properties directly from **Realtor.com**. - **Source**: Fetches properties directly from **Realtor.com**.
- **Data Format**: Structures data to resemble MLS listings. - **Data Format**: Structures data to resemble MLS listings.
- **Export Flexibility**: Options to save as either CSV or Excel. - **Export Flexibility**: Options to save as either CSV or Excel.
- **Usage Modes**:
- **Python**: For those who'd like to integrate scraping into their Python scripts.
[Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_ [Video Guide for HomeHarvest](https://youtu.be/J1qgNPgmSLI) - _updated for release v0.3.4_
@@ -21,9 +19,9 @@
## Installation ## Installation
```bash ```bash
pip install homeharvest pip install -U homeharvest
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
## Usage ## Usage
@@ -41,11 +39,11 @@ properties = scrape_property(
location="San Diego, CA", location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending) listing_type="sold", # or (for_sale, for_rent, pending)
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
# date_from="2023-05-01", # alternative to past_days # date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28", # date_to="2023-05-28",
# foreclosure=True # foreclosure=True
# mls_only=True, # only fetch MLS listings # mls_only=True, # only fetch MLS listings
) )
print(f"Number of properties: {len(properties)}") print(f"Number of properties: {len(properties)}")
@@ -86,7 +84,7 @@ Optional
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required. ├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
| (use this to get properties in chunks as there's a 10k result limit) | (use this to get properties in chunks as there's a 10k result limit)
│ Format for both must be "YYYY-MM-DD". │ Format for both must be "YYYY-MM-DD".
│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates) │ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
@@ -126,17 +124,28 @@ Property
│ ├── days_on_mls │ ├── days_on_mls
│ ├── list_price │ ├── list_price
│ ├── list_date │ ├── list_date
│ ├── pending_date
│ ├── sold_price │ ├── sold_price
│ ├── last_sold_date │ ├── last_sold_date
│ ├── price_per_sqft │ ├── price_per_sqft
│ ├── parking_garage
│ └── hoa_fee │ └── hoa_fee
├── Location Details: ├── Location Details:
│ ├── latitude │ ├── latitude
│ ├── longitude │ ├── longitude
│ ├── nearby_schools
└── Parking Details:
└── parking_garage ├── Agent Info:
│ ├── agent
│ ├── broker
│ └── broker_phone
├── Agent Info:
│ ├── agent
│ ├── broker
│ └── broker_phone
``` ```
### Exceptions ### Exceptions
@@ -144,4 +153,3 @@ The following exceptions may be raised when using HomeHarvest:
- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold` - `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD - `InvalidDate` - date_from or date_to is not in the format YYYY-MM-DD

View File

@@ -5,9 +5,7 @@ from homeharvest import scrape_property
def main(): def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper") parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument( parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)
parser.add_argument( parser.add_argument(
"-l", "-l",
@@ -35,9 +33,7 @@ def main():
help="Name of the output file (without extension)", help="Name of the output file (without extension)",
) )
parser.add_argument( parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)
parser.add_argument( parser.add_argument(
"-d", "-d",
"--days", "--days",

View File

@@ -1,5 +1,7 @@
import uuid
from dataclasses import dataclass from dataclasses import dataclass
import requests import requests
import uuid
from .models import Property, ListingType, SiteName from .models import Property, ListingType, SiteName
@@ -27,6 +29,12 @@ class Scraper:
if not session: if not session:
self.session = requests.Session() self.session = requests.Session()
self.session.headers.update(
{
"auth": f"Bearer {self.get_access_token()}",
"apollographql-client-name": "com.move.Realtor-apollo-ios",
}
)
else: else:
self.session = session self.session = session
@@ -43,12 +51,26 @@ class Scraper:
self.date_to = scraper_input.date_to self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure self.foreclosure = scraper_input.foreclosure
def search(self) -> list[Property]: def search(self) -> list[Property]: ...
...
@staticmethod @staticmethod
def _parse_home(home) -> Property: def _parse_home(home) -> Property: ...
...
def handle_location(self): def handle_location(self): ...
...
def get_access_token(self):
url = "https://graph.realtor.com/auth/token"
payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
headers = {
"Host": "graph.realtor.com",
"x-client-version": "24.20.4.149916",
"accept": "*/*",
"content-type": "Application/json",
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
"accept-language": "en-US,en;q=0.9",
}
response = requests.post(url, headers=headers, data=payload)
data = response.json()
return data["access_token"]

View File

@@ -23,6 +23,33 @@ class ListingType(Enum):
SOLD = "SOLD" SOLD = "SOLD"
@dataclass
class Agent:
name: str | None = None
phone: str | None = None
class PropertyType(Enum):
APARTMENT = "APARTMENT"
BUILDING = "BUILDING"
COMMERCIAL = "COMMERCIAL"
CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO"
CONDOS = "CONDOS"
COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
FARM = "FARM"
INVESTMENT = "INVESTMENT"
LAND = "LAND"
MOBILE = "MOBILE"
MULTI_FAMILY = "MULTI_FAMILY"
RENTAL = "RENTAL"
SINGLE_FAMILY = "SINGLE_FAMILY"
TOWNHOMES = "TOWNHOMES"
OTHER = "OTHER"
@dataclass @dataclass
class Address: class Address:
street: str | None = None street: str | None = None
@@ -36,7 +63,7 @@ class Address:
class Description: class Description:
primary_photo: str | None = None primary_photo: str | None = None
alt_photos: list[str] | None = None alt_photos: list[str] | None = None
style: str | None = None style: PropertyType | None = None
beds: int | None = None beds: int | None = None
baths_full: int | None = None baths_full: int | None = None
baths_half: int | None = None baths_half: int | None = None
@@ -48,6 +75,12 @@ class Description:
stories: int | None = None stories: int | None = None
@dataclass
class Agent:
name: str | None = None
phone: str | None = None
@dataclass @dataclass
class Property: class Property:
property_url: str property_url: str
@@ -58,6 +91,7 @@ class Property:
list_price: int | None = None list_price: int | None = None
list_date: str | None = None list_date: str | None = None
pending_date: str | None = None
last_sold_date: str | None = None last_sold_date: str | None = None
prc_sqft: int | None = None prc_sqft: int | None = None
hoa_fee: int | None = None hoa_fee: int | None = None
@@ -67,3 +101,6 @@ class Property:
latitude: float | None = None latitude: float | None = None
longitude: float | None = None longitude: float | None = None
neighborhoods: Optional[str] = None neighborhoods: Optional[str] = None
agents: list[Agent] = None
nearby_schools: list[str] = None

View File

@@ -4,18 +4,21 @@ homeharvest.realtor.__init__
This module implements the scraper for realtor.com This module implements the scraper for realtor.com
""" """
from datetime import datetime from datetime import datetime
from typing import Dict, Union, Optional from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .. import Scraper from .. import Scraper
from ..models import Property, Address, ListingType, Description from ..models import Property, Address, ListingType, Description, PropertyType, Agent
class RealtorScraper(Scraper): class RealtorScraper(Scraper):
SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" SEARCH_GQL_URL = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/" PROPERTY_URL = "https://www.realtor.com/realestateandhomes-detail/"
PROPERTY_GQL = "https://graph.realtor.com/graphql"
ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest" ADDRESS_AUTOCOMPLETE_URL = "https://parser-external.geo.moveaws.com/suggest"
NUM_PROPERTY_WORKERS = 20
def __init__(self, scraper_input): def __init__(self, scraper_input):
super().__init__(scraper_input) super().__init__(scraper_input)
@@ -84,11 +87,10 @@ class RealtorScraper(Scraper):
garage garage
permalink permalink
} }
primary_photo { media {
href photos {
} href
photos { }
href
} }
} }
}""" }"""
@@ -111,18 +113,22 @@ class RealtorScraper(Scraper):
) )
able_to_get_lat_long = ( able_to_get_lat_long = (
property_info property_info
and property_info.get("address") and property_info.get("address")
and property_info["address"].get("location") and property_info["address"].get("location")
and property_info["address"]["location"].get("coordinate") and property_info["address"]["location"].get("coordinate")
) )
list_date_str = property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get( list_date_str = (
"list_date") else None property_info["basic"]["list_date"].split("T")[0] if property_info["basic"].get("list_date") else None
last_sold_date_str = property_info["basic"]["sold_date"].split("T")[0] if property_info["basic"].get( )
"sold_date") else None last_sold_date_str = (
property_info["basic"]["sold_date"].split("T")[0] if property_info["basic"].get("sold_date") else None
)
pending_date_str = property_info["pending_date"].split("T")[0] if property_info.get("pending_date") else None
list_date = datetime.strptime(list_date_str, "%Y-%m-%d") if list_date_str else None list_date = datetime.strptime(list_date_str, "%Y-%m-%d") if list_date_str else None
last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None last_sold_date = datetime.strptime(last_sold_date_str, "%Y-%m-%d") if last_sold_date_str else None
pending_date = datetime.strptime(pending_date_str, "%Y-%m-%d") if pending_date_str else None
today = datetime.now() today = datetime.now()
days_on_mls = None days_on_mls = None
@@ -130,36 +136,36 @@ class RealtorScraper(Scraper):
if list_date: if list_date:
if status == "sold" and last_sold_date: if status == "sold" and last_sold_date:
days_on_mls = (last_sold_date - list_date).days days_on_mls = (last_sold_date - list_date).days
elif status in ('for_sale', 'for_rent'): elif status in ("for_sale", "for_rent"):
days_on_mls = (today - list_date).days days_on_mls = (today - list_date).days
if days_on_mls and days_on_mls < 0: if days_on_mls and days_on_mls < 0:
days_on_mls = None days_on_mls = None
property_id = property_info["details"]["permalink"]
agents_schools = self.get_agents_schools(property_id)
listing = Property( listing = Property(
mls=mls, mls=mls,
mls_id=property_info["source"].get("listing_id") mls_id=(
if "source" in property_info and isinstance(property_info["source"], dict) property_info["source"].get("listing_id")
else None, if "source" in property_info and isinstance(property_info["source"], dict)
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", else None
),
property_url=f"{self.PROPERTY_URL}{property_id}",
status=property_info["basic"]["status"].upper(), status=property_info["basic"]["status"].upper(),
list_price=property_info["basic"]["price"], list_price=property_info["basic"]["price"],
list_date=list_date, list_date=list_date,
prc_sqft=property_info["basic"].get("price") prc_sqft=(
/ property_info["basic"].get("sqft") property_info["basic"].get("price") / property_info["basic"].get("sqft")
if property_info["basic"].get("price") if property_info["basic"].get("price") and property_info["basic"].get("sqft")
and property_info["basic"].get("sqft") else None
else None, ),
last_sold_date=last_sold_date, last_sold_date=last_sold_date,
latitude=property_info["address"]["location"]["coordinate"].get("lat") pending_date=pending_date,
if able_to_get_lat_long latitude=property_info["address"]["location"]["coordinate"].get("lat") if able_to_get_lat_long else None,
else None, longitude=property_info["address"]["location"]["coordinate"].get("lon") if able_to_get_lat_long else None,
longitude=property_info["address"]["location"]["coordinate"].get("lon")
if able_to_get_lat_long
else None,
address=self._parse_address(property_info, search_type="handle_listing"), address=self._parse_address(property_info, search_type="handle_listing"),
description=Description( description=Description(
primary_photo=property_info["primary_photo"].get("href", "").replace("s.jpg", "od-w480_h360_x2.webp?w=1080&q=75"), alt_photos=self.process_alt_photos(property_info.get("media", {}).get("photos", [])),
alt_photos=self.process_alt_photos(property_info.get("photos", [])),
style=property_info["basic"].get("type", "").upper(), style=property_info["basic"].get("type", "").upper(),
beds=property_info["basic"].get("beds"), beds=property_info["basic"].get("beds"),
baths_full=property_info["basic"].get("baths_full"), baths_full=property_info["basic"].get("baths_full"),
@@ -171,7 +177,9 @@ class RealtorScraper(Scraper):
garage=property_info["details"].get("garage"), garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"), stories=property_info["details"].get("stories"),
), ),
days_on_mls=days_on_mls days_on_mls=days_on_mls,
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
) )
return [listing] return [listing]
@@ -265,6 +273,7 @@ class RealtorScraper(Scraper):
}""" }"""
variables = {"property_id": property_id} variables = {"property_id": property_id}
agents_schools = self.get_agents_schools(property_id)
payload = { payload = {
"query": query, "query": query,
@@ -280,16 +289,14 @@ class RealtorScraper(Scraper):
Property( Property(
mls_id=property_id, mls_id=property_id,
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}", property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
address=self._parse_address( address=self._parse_address(property_info, search_type="handle_address"),
property_info, search_type="handle_address"
),
description=self._parse_description(property_info), description=self._parse_description(property_info),
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
) )
] ]
def general_search( def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[int, list[Property]]]:
self, variables: dict, search_type: str
) -> Dict[str, Union[int, list[Property]]]:
""" """
Handles a location area & returns a list of properties Handles a location area & returns a list of properties
""" """
@@ -297,6 +304,7 @@ class RealtorScraper(Scraper):
count count
total total
results { results {
pending_date
property_id property_id
list_date list_date
status status
@@ -309,6 +317,7 @@ class RealtorScraper(Scraper):
is_pending is_pending
} }
description { description {
type
sqft sqft
beds beds
baths_full baths_full
@@ -377,20 +386,17 @@ class RealtorScraper(Scraper):
) )
pending_or_contingent_param = ( pending_or_contingent_param = (
"or_filters: { contingent: true, pending: true }" "or_filters: { contingent: true, pending: true }" if self.listing_type == ListingType.PENDING else ""
if self.listing_type == ListingType.PENDING
else ""
) )
listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type listing_type = ListingType.FOR_SALE if self.listing_type == ListingType.PENDING else self.listing_type
is_foreclosure = "" is_foreclosure = ""
if 'foreclosure' in variables and variables['foreclosure'] == True: if variables.get("foreclosure") is True:
is_foreclosure = "foreclosure: true" is_foreclosure = "foreclosure: true"
elif variables.get("foreclosure") is False:
if 'foreclosure' in variables and variables['foreclosure'] == False: is_foreclosure = "foreclosure: false"
is_foreclosure = "foreclosure: false"
if search_type == "comps": #: comps search, came from an address if search_type == "comps": #: comps search, came from an address
query = """query Property_search( query = """query Property_search(
$coordinates: [Float]! $coordinates: [Float]!
@@ -398,11 +404,11 @@ class RealtorScraper(Scraper):
$offset: Int!, $offset: Int!,
) { ) {
home_search( home_search(
query: { query: {
%s %s
nearby: { nearby: {
coordinates: $coordinates coordinates: $coordinates
radius: $radius radius: $radius
} }
status: %s status: %s
%s %s
@@ -412,7 +418,7 @@ class RealtorScraper(Scraper):
limit: 200 limit: 200
offset: $offset offset: $offset
) %s""" % ( ) %s""" % (
is_foreclosure, is_foreclosure,
listing_type.value.lower(), listing_type.value.lower(),
date_param, date_param,
pending_or_contingent_param, pending_or_contingent_param,
@@ -487,15 +493,11 @@ class RealtorScraper(Scraper):
): ):
return {"total": 0, "properties": []} return {"total": 0, "properties": []}
for result in response_json["data"][search_key]["results"]: def process_property(result: dict) -> Property | None:
mls = ( mls = result["source"].get("id") if "source" in result and isinstance(result["source"], dict) else None
result["source"].get("id")
if "source" in result and isinstance(result["source"], dict)
else None
)
if not mls and self.mls_only: if not mls and self.mls_only:
continue return
able_to_get_lat_long = ( able_to_get_lat_long = (
result result
@@ -507,35 +509,48 @@ class RealtorScraper(Scraper):
is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent") is_pending = result["flags"].get("is_pending") or result["flags"].get("is_contingent")
if is_pending and self.listing_type != ListingType.PENDING: if is_pending and self.listing_type != ListingType.PENDING:
continue return
property_id = result["property_id"]
agents_schools = self.get_agents_schools(property_id)
realty_property = Property( realty_property = Property(
mls=mls, mls=mls,
mls_id=result["source"].get("listing_id") mls_id=(
if "source" in result and isinstance(result["source"], dict) result["source"].get("listing_id")
else None, if "source" in result and isinstance(result["source"], dict)
property_url=f"{self.PROPERTY_URL}{result['property_id']}" if self.listing_type != ListingType.FOR_RENT else f"{self.PROPERTY_URL}M{result['property_id']}?listing_status=rental", else None
),
property_url=(
f"{self.PROPERTY_URL}{property_id}"
if self.listing_type != ListingType.FOR_RENT
else f"{self.PROPERTY_URL}M{property_id}?listing_status=rental"
),
status="PENDING" if is_pending else result["status"].upper(), status="PENDING" if is_pending else result["status"].upper(),
list_price=result["list_price"], list_price=result["list_price"],
list_date=result["list_date"].split("T")[0] list_date=result["list_date"].split("T")[0] if result.get("list_date") else None,
if result.get("list_date")
else None,
prc_sqft=result.get("price_per_sqft"), prc_sqft=result.get("price_per_sqft"),
last_sold_date=result.get("last_sold_date"), last_sold_date=result.get("last_sold_date"),
hoa_fee=result["hoa"]["fee"] hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None,
if result.get("hoa") and isinstance(result["hoa"], dict) latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None,
else None, longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None,
latitude=result["location"]["address"]["coordinate"].get("lat")
if able_to_get_lat_long
else None,
longitude=result["location"]["address"]["coordinate"].get("lon")
if able_to_get_lat_long
else None,
address=self._parse_address(result, search_type="general_search"), address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result), description=self._parse_description(result),
days_on_mls=self.calculate_days_on_mls(result) days_on_mls=self.calculate_days_on_mls(result),
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
) )
properties.append(realty_property) return realty_property
with ThreadPoolExecutor(max_workers=self.NUM_PROPERTY_WORKERS) as executor:
futures = [
executor.submit(process_property, result) for result in response_json["data"][search_key]["results"]
]
for future in as_completed(futures):
result = future.result()
if result:
properties.append(result)
return { return {
"total": response_json["data"][search_key]["total"], "total": response_json["data"][search_key]["total"],
@@ -552,22 +567,18 @@ class RealtorScraper(Scraper):
search_variables = { search_variables = {
"offset": 0, "offset": 0,
} }
search_type = ( search_type = (
"comps" "comps"
if self.radius and location_type == "address" if self.radius and location_type == "address"
else "address" else "address" if location_type == "address" and not self.radius else "area"
if location_type == "address" and not self.radius
else "area"
) )
if location_type == "address": if location_type == "address":
if not self.radius: #: single address search, non comps if not self.radius: #: single address search, non comps
property_id = location_info["mpr_id"] property_id = location_info["mpr_id"]
search_variables |= {"property_id": property_id} search_variables |= {"property_id": property_id}
gql_results = self.general_search( gql_results = self.general_search(search_variables, search_type=search_type)
search_variables, search_type=search_type
)
if gql_results["total"] == 0: if gql_results["total"] == 0:
listing_id = self.get_latest_listing_id(property_id) listing_id = self.get_latest_listing_id(property_id)
if listing_id is None: if listing_id is None:
@@ -578,6 +589,9 @@ class RealtorScraper(Scraper):
return gql_results["properties"] return gql_results["properties"]
else: #: general search, comps (radius) else: #: general search, comps (radius)
if not location_info.get("centroid"):
return []
coordinates = list(location_info["centroid"].values()) coordinates = list(location_info["centroid"].values())
search_variables |= { search_variables |= {
"coordinates": coordinates, "coordinates": coordinates,
@@ -597,8 +611,8 @@ class RealtorScraper(Scraper):
"postal_code": location_info.get("postal_code"), "postal_code": location_info.get("postal_code"),
} }
if self.foreclosure: if self.foreclosure:
search_variables['foreclosure'] = self.foreclosure search_variables["foreclosure"] = self.foreclosure
result = self.general_search(search_variables, search_type=search_type) result = self.general_search(search_variables, search_type=search_type)
total = result["total"] total = result["total"]
@@ -619,6 +633,26 @@ class RealtorScraper(Scraper):
return homes return homes
def get_agents_schools(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }}}}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}'
response = self.session.post(self.PROPERTY_GQL, data=payload)
def get_key(keys: list):
try:
data = response.json()
for key in keys:
data = data[key]
return data
except (KeyError, TypeError):
return []
ads = get_key(["data", "home", "consumerAdvertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads]
schools = [school["district"]["name"] for school in schools]
return {"agents": agents, "schools": schools}
@staticmethod @staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]: def _parse_neighborhoods(result: dict) -> Optional[str]:
neighborhoods_list = [] neighborhoods_list = []
@@ -641,17 +675,19 @@ class RealtorScraper(Scraper):
def _parse_address(self, result: dict, search_type): def _parse_address(self, result: dict, search_type):
if search_type == "general_search": if search_type == "general_search":
address = result['location']['address'] address = result["location"]["address"]
else: else:
address = result["address"] address = result["address"]
return Address( return Address(
street=" ".join([ street=" ".join(
self.handle_none_safely(address.get('street_number')), [
self.handle_none_safely(address.get('street_direction')), self.handle_none_safely(address.get("street_number")),
self.handle_none_safely(address.get('street_name')), self.handle_none_safely(address.get("street_direction")),
self.handle_none_safely(address.get('street_suffix')), self.handle_none_safely(address.get("street_name")),
]).strip(), self.handle_none_safely(address.get("street_suffix")),
]
).strip(),
unit=address["unit"], unit=address["unit"],
city=address["city"], city=address["city"],
state=address["state_code"], state=address["state_code"],
@@ -660,7 +696,6 @@ class RealtorScraper(Scraper):
@staticmethod @staticmethod
def _parse_description(result: dict) -> Description: def _parse_description(result: dict) -> Description:
description_data = result.get("description", {}) description_data = result.get("description", {})
if description_data is None or not isinstance(description_data, dict): if description_data is None or not isinstance(description_data, dict):
@@ -680,7 +715,7 @@ class RealtorScraper(Scraper):
return Description( return Description(
primary_photo=primary_photo, primary_photo=primary_photo,
alt_photos=RealtorScraper.process_alt_photos(result.get("photos")), alt_photos=RealtorScraper.process_alt_photos(result.get("photos")),
style=style, style=PropertyType(style) if style else None,
beds=description_data.get("beds"), beds=description_data.get("beds"),
baths_full=description_data.get("baths_full"), baths_full=description_data.get("baths_full"),
baths_half=description_data.get("baths_half"), baths_half=description_data.get("baths_half"),
@@ -692,7 +727,6 @@ class RealtorScraper(Scraper):
stories=description_data.get("stories"), stories=description_data.get("stories"),
) )
@staticmethod @staticmethod
def calculate_days_on_mls(result: dict) -> Optional[int]: def calculate_days_on_mls(result: dict) -> Optional[int]:
list_date_str = result.get("list_date") list_date_str = result.get("list_date")
@@ -702,12 +736,12 @@ class RealtorScraper(Scraper):
today = datetime.now() today = datetime.now()
if list_date: if list_date:
if result["status"] == 'sold': if result["status"] == "sold":
if last_sold_date: if last_sold_date:
days = (last_sold_date - list_date).days days = (last_sold_date - list_date).days
if days >= 0: if days >= 0:
return days return days
elif result["status"] in ('for_sale', 'for_rent'): elif result["status"] in ("for_sale", "for_rent"):
days = (today - list_date).days days = (today - list_date).days
if days >= 0: if days >= 0:
return days return days

View File

@@ -1,5 +1,6 @@
class InvalidListingType(Exception): class InvalidListingType(Exception):
"""Raised when a provided listing type is does not exist.""" """Raised when a provided listing type is does not exist."""
class InvalidDate(Exception): class InvalidDate(Exception):
"""Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23 """ """Raised when only one of date_from or date_to is provided or not in the correct format. ex: 2023-10-23"""

View File

@@ -31,6 +31,10 @@ ordered_properties = [
"stories", "stories",
"hoa_fee", "hoa_fee",
"parking_garage", "parking_garage",
"agent",
"broker",
"broker_phone",
"nearby_schools",
"primary_photo", "primary_photo",
"alt_photos", "alt_photos",
] ]
@@ -48,12 +52,22 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["state"] = address_data.state prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip prop_data["zip_code"] = address_data.zip
if "agents" in prop_data:
agents = prop_data["agents"]
if agents:
prop_data["agent"] = agents[0].name
if len(agents) > 1:
prop_data["broker"] = agents[1].name
prop_data["broker_phone"] = agents[1].phone
prop_data["price_per_sqft"] = prop_data["prc_sqft"] prop_data["price_per_sqft"] = prop_data["prc_sqft"]
prop_data["nearby_schools"] = filter(None, prop_data["nearby_schools"]) if prop_data["nearby_schools"] else None
prop_data["nearby_schools"] = ", ".join(set(prop_data["nearby_schools"])) if prop_data["nearby_schools"] else None
description = result.description description = result.description
prop_data["primary_photo"] = description.primary_photo prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos) prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style prop_data["style"] = description.style.value
prop_data["beds"] = description.beds prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half prop_data["half_baths"] = description.baths_half
@@ -72,9 +86,7 @@ def process_result(result: Property) -> pd.DataFrame:
def validate_input(listing_type: str) -> None: def validate_input(listing_type: str) -> None:
if listing_type.upper() not in ListingType.__members__: if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType( raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
f"Provided listing type, '{listing_type}', does not exist."
)
def validate_dates(date_from: str | None, date_to: str | None) -> None: def validate_dates(date_from: str | None, date_to: str | None) -> None:

214
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]] [[package]]
name = "certifi" name = "certifi"
@@ -11,6 +11,17 @@ files = [
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
] ]
[[package]]
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
version = "3.3.0" version = "3.3.0"
@@ -122,14 +133,14 @@ files = [
] ]
[[package]] [[package]]
name = "et-xmlfile" name = "distlib"
version = "1.1.0" version = "0.3.8"
description = "An implementation of lxml.xmlfile for the standard library" description = "Distribution utilities"
optional = false optional = false
python-versions = ">=3.6" python-versions = "*"
files = [ files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
] ]
[[package]] [[package]]
@@ -146,6 +157,36 @@ files = [
[package.extras] [package.extras]
test = ["pytest (>=6)"] test = ["pytest (>=6)"]
[[package]]
name = "filelock"
version = "3.13.4"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
{file = "filelock-3.13.4-py3-none-any.whl", hash = "sha256:404e5e9253aa60ad457cae1be07c0f0ca90a63931200a47d9b6a6af84fd7b45f"},
{file = "filelock-3.13.4.tar.gz", hash = "sha256:d13f466618bfde72bd2c18255e269f72542c6e70e7bac83a0232d6b1cc5c8cf4"},
]
[package.extras]
docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
typing = ["typing-extensions (>=4.8)"]
[[package]]
name = "identify"
version = "2.5.35"
description = "File identification library for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "identify-2.5.35-py2.py3-none-any.whl", hash = "sha256:c4de0081837b211594f8e877a6b4fad7ca32bbfc1a9307fdd61c28bfe923f13e"},
{file = "identify-2.5.35.tar.gz", hash = "sha256:10a7ca245cfcd756a554a7288159f72ff105ad233c7c4b9c6f0f4d108f5f6791"},
]
[package.extras]
license = ["ukkonen"]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.4" version = "3.4"
@@ -168,6 +209,20 @@ files = [
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
] ]
[[package]]
name = "nodeenv"
version = "1.8.0"
description = "Node.js virtual environment builder"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
files = [
{file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
{file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
]
[package.dependencies]
setuptools = "*"
[[package]] [[package]]
name = "numpy" name = "numpy"
version = "1.26.0" version = "1.26.0"
@@ -209,20 +264,6 @@ files = [
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, {file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"},
] ]
[[package]]
name = "openpyxl"
version = "3.1.2"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = false
python-versions = ">=3.6"
files = [
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "23.2" version = "23.2"
@@ -302,6 +343,21 @@ sql-other = ["SQLAlchemy (>=1.4.36)"]
test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
xml = ["lxml (>=4.8.0)"] xml = ["lxml (>=4.8.0)"]
[[package]]
name = "platformdirs"
version = "4.2.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
optional = false
python-versions = ">=3.8"
files = [
{file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"},
{file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"},
]
[package.extras]
docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"]
[[package]] [[package]]
name = "pluggy" name = "pluggy"
version = "1.3.0" version = "1.3.0"
@@ -317,6 +373,24 @@ files = [
dev = ["pre-commit", "tox"] dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"] testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pre-commit"
version = "3.7.0"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
files = [
{file = "pre_commit-3.7.0-py2.py3-none-any.whl", hash = "sha256:5eae9e10c2b5ac51577c3452ec0a490455c45a0533f7960f993a0d01e59decab"},
{file = "pre_commit-3.7.0.tar.gz", hash = "sha256:e209d61b8acdcf742404408531f0c37d49d2c734fd7cff2d6076083d191cb060"},
]
[package.dependencies]
cfgv = ">=2.0.0"
identify = ">=1.0.0"
nodeenv = ">=0.11.1"
pyyaml = ">=5.1"
virtualenv = ">=20.10.0"
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "7.4.2" version = "7.4.2"
@@ -364,6 +438,66 @@ files = [
{file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"},
] ]
[[package]]
name = "pyyaml"
version = "6.0.1"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.6"
files = [
{file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
{file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
{file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
{file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
{file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
{file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
{file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
{file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
{file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
{file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
]
[[package]] [[package]]
name = "requests" name = "requests"
version = "2.31.0" version = "2.31.0"
@@ -385,6 +519,22 @@ urllib3 = ">=1.21.1,<3"
socks = ["PySocks (>=1.5.6,!=1.5.7)"] socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "setuptools"
version = "69.5.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]] [[package]]
name = "six" name = "six"
version = "1.16.0" version = "1.16.0"
@@ -435,7 +585,27 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["zstandard (>=0.18.0)"] zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "virtualenv"
version = "20.25.1"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.7"
files = [
{file = "virtualenv-20.25.1-py3-none-any.whl", hash = "sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a"},
{file = "virtualenv-20.25.1.tar.gz", hash = "sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.10,<3.13" python-versions = ">=3.10,<3.13"
content-hash = "09ad811d74a42363ff4c3ccd012d8f73c89d7d978e5a6445b0f3d2e231922f1b" content-hash = "371781da268d5f61d6e798c023777f337b620e9b07a48c316825d7b998b63f02"

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "homeharvest" name = "homeharvest"
version = "0.3.12" version = "0.3.18"
description = "Real estate scraping library" description = "Real estate scraping library"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/HomeHarvest" homepage = "https://github.com/Bunsly/HomeHarvest"
@@ -17,6 +17,7 @@ pandas = "^2.1.1"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.4.2" pytest = "^7.4.2"
pre-commit = "^3.7.0"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]

View File

@@ -1,22 +1,12 @@
from homeharvest import scrape_property from homeharvest import scrape_property
from homeharvest.exceptions import (
InvalidListingType,
)
def test_realtor_pending_or_contingent(): def test_realtor_pending_or_contingent():
pending_or_contingent_result = scrape_property( pending_or_contingent_result = scrape_property(location="Surprise, AZ", listing_type="pending")
location="Surprise, AZ", listing_type="pending"
)
regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale") regular_result = scrape_property(location="Surprise, AZ", listing_type="for_sale")
assert all( assert all([result is not None for result in [pending_or_contingent_result, regular_result]])
[
result is not None
for result in [pending_or_contingent_result, regular_result]
]
)
assert len(pending_or_contingent_result) != len(regular_result) assert len(pending_or_contingent_result) != len(regular_result)
@@ -71,17 +61,13 @@ def test_realtor_comps():
def test_realtor_last_x_days_sold(): def test_realtor_last_x_days_sold():
days_result_30 = scrape_property( days_result_30 = scrape_property(location="Dallas, TX", listing_type="sold", past_days=30)
location="Dallas, TX", listing_type="sold", past_days=30
)
days_result_10 = scrape_property( days_result_10 = scrape_property(location="Dallas, TX", listing_type="sold", past_days=10)
location="Dallas, TX", listing_type="sold", past_days=10
)
assert all( assert all([result is not None for result in [days_result_30, days_result_10]]) and len(days_result_30) != len(
[result is not None for result in [days_result_30, days_result_10]] days_result_10
) and len(days_result_30) != len(days_result_10) )
def test_realtor_date_range_sold(): def test_realtor_date_range_sold():
@@ -93,9 +79,9 @@ def test_realtor_date_range_sold():
location="Dallas, TX", listing_type="sold", date_from="2023-04-01", date_to="2023-06-10" location="Dallas, TX", listing_type="sold", date_from="2023-04-01", date_to="2023-06-10"
) )
assert all( assert all([result is not None for result in [days_result_30, days_result_60]]) and len(days_result_30) < len(
[result is not None for result in [days_result_30, days_result_60]] days_result_60
) and len(days_result_30) < len(days_result_60) )
def test_realtor_single_property(): def test_realtor_single_property():
@@ -119,35 +105,40 @@ def test_realtor():
location="2530 Al Lipscomb Way", location="2530 Al Lipscomb Way",
listing_type="for_sale", listing_type="for_sale",
), ),
scrape_property( scrape_property(location="Phoenix, AZ", listing_type="for_rent"), #: does not support "city, state, USA" format
location="Phoenix, AZ", listing_type="for_rent" scrape_property(location="Dallas, TX", listing_type="sold"), #: does not support "city, state, USA" format
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", listing_type="sold"
), #: does not support "city, state, USA" format
scrape_property(location="85281"), scrape_property(location="85281"),
] ]
assert all([result is not None for result in results]) assert all([result is not None for result in results])
def test_realtor_city():
results = scrape_property(
location="Atlanta, GA",
listing_type="for_sale",
)
assert results is not None and len(results) > 0
def test_realtor_bad_address(): def test_realtor_bad_address():
bad_results = scrape_property( bad_results = scrape_property(
location="abceefg ju098ot498hh9", location="abceefg ju098ot498hh9",
listing_type="for_sale", listing_type="for_sale",
) )
if len(bad_results) == 0: if len(bad_results) == 0:
assert True assert True
def test_realtor_foreclosed(): def test_realtor_foreclosed():
foreclosed = scrape_property( foreclosed = scrape_property(location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=True)
location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=True
)
not_foreclosed = scrape_property( not_foreclosed = scrape_property(location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=False)
location="Dallas, TX", listing_type="for_sale", past_days=100, foreclosure=False
)
assert len(foreclosed) != len(not_foreclosed) assert len(foreclosed) != len(not_foreclosed)
def test_realtor_agent():
scraped = scrape_property(location="Detroit, MI", listing_type="for_sale")
assert scraped["agent"].nunique() > 1