diff --git a/README.md b/README.md index 76a1618..ad72eed 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ Optional Property ├── Basic Information: │ ├── property_url +│ ├── property_id +│ ├── listing_id │ ├── mls │ ├── mls_id │ └── status diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 5fd018d..745d766 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -46,8 +46,21 @@ class Scraper: Scraper.session.mount("https://", adapter) Scraper.session.headers.update( { - "auth": f"Bearer {self.get_access_token()}", - "apollographql-client-name": "com.move.Realtor-apollo-ios", + 'accept': 'application/json, text/javascript', + 'accept-language': 'en-US,en;q=0.9', + 'cache-control': 'no-cache', + 'content-type': 'application/json', + 'origin': 'https://www.realtor.com', + 'pragma': 'no-cache', + 'priority': 'u=1, i', + 'rdc-ab-tests': 'commute_travel_time_variation:v1', + 'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', } ) diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index bbda390..98be66b 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -133,6 +133,10 @@ class Advertisers: @dataclass class Property: property_url: str + + property_id: str + listing_id: str | None = None + mls: str | None = None mls_id: str | None = None status: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 459c9c7..355b5b3 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ 
b/homeharvest/core/scrapers/realtor/__init__.py @@ -181,11 +181,9 @@ class RealtorScraper(Scraper): if "source" in result and isinstance(result["source"], dict) else None ), - property_url=( - f"{self.PROPERTY_URL}{property_id}" - if self.listing_type != ListingType.FOR_RENT - else f"{self.PROPERTY_URL}M{property_id}?listing_status=rental" - ), + property_url=result["href"], + property_id=property_id, + listing_id=result.get("listing_id"), status="PENDING" if is_pending else result["status"].upper(), list_price=result["list_price"], list_price_min=result["list_price_min"], @@ -469,7 +467,7 @@ class RealtorScraper(Scraper): }""" variables = {"property_id": property_id} - response = self.session.post(self.PROPERTY_GQL, json={"query": query, "variables": variables}) + response = self.session.post(self.SEARCH_GQL_URL, json={"query": query, "variables": variables}) data = response.json() property_details = data["data"]["home"] diff --git a/homeharvest/core/scrapers/realtor/queries.py b/homeharvest/core/scrapers/realtor/queries.py index 9ae4d18..4df8e91 100644 --- a/homeharvest/core/scrapers/realtor/queries.py +++ b/homeharvest/core/scrapers/realtor/queries.py @@ -2,6 +2,7 @@ _SEARCH_HOMES_DATA_BASE = """{ pending_date listing_id property_id + href list_date status last_sold_price diff --git a/homeharvest/utils.py b/homeharvest/utils.py index 6de5292..e754190 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -6,6 +6,8 @@ from .exceptions import InvalidListingType, InvalidDate ordered_properties = [ "property_url", + "property_id", + "listing_id", "mls", "mls_id", "status", diff --git a/pyproject.toml b/pyproject.toml index 110013e..2f0d8cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.4.2" +version = "0.4.3" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest" diff --git a/tests/test_realtor.py 
b/tests/test_realtor.py index 1585a3d..284875a 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -128,6 +128,7 @@ def test_realtor_bad_address(): location="abceefg ju098ot498hh9", listing_type="for_sale", ) + if len(bad_results) == 0: assert True @@ -253,3 +254,29 @@ def test_builder_exists(): assert listing is not None assert listing["builder_name"].nunique() > 0 + + +def test_phone_number_matching(): + searches = [ + scrape_property( + location="Phoenix, AZ", + listing_type="for_sale", + limit=100, + ), + scrape_property( + location="Phoenix, AZ", + listing_type="for_sale", + limit=100, + ), + ] + + assert all([search is not None for search in searches]) + + #: pick one random row from the first search that has agent phone data (NOTE(review): sample() raises if no row qualifies) + row = searches[0][searches[0]["agent_phones"].notnull()].sample() + + #: look up the row with the same property_url in the second search (NOTE(review): empty if that listing is absent there, so .values[0] below would raise IndexError) + matching_row = searches[1].loc[searches[1]["property_url"] == row["property_url"].values[0]] + + #: the same listing must report identical agent phones in both searches + assert row["agent_phones"].values[0] == matching_row["agent_phones"].values[0]