From d05bc5d79fa41a047c3de9a175f3d78d3d033a85 Mon Sep 17 00:00:00 2001
From: Cullen
Date: Thu, 4 Apr 2024 17:05:00 -0500
Subject: [PATCH] fix: redfin

---
 example.py                                   | 11 +++++++++++
 homeharvest/core/scrapers/__init__.py        |  1 +
 homeharvest/core/scrapers/redfin/__init__.py |  2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 example.py

diff --git a/example.py b/example.py
new file mode 100644
index 0000000..a926140
--- /dev/null
+++ b/example.py
@@ -0,0 +1,11 @@
+from homeharvest import scrape_property
+import pandas as pd
+
+properties: pd.DataFrame = scrape_property(
+    site_name=["redfin"],
+    location="85281",
+    listing_type="for_rent" # for_sale / sold
+)
+
+print(properties)
+properties.to_csv('properties.csv', index=False)
diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py
index 95ed3e1..3107d5e 100644
--- a/homeharvest/core/scrapers/__init__.py
+++ b/homeharvest/core/scrapers/__init__.py
@@ -17,6 +17,7 @@ class Scraper:
         self.listing_type = scraper_input.listing_type
 
         self.session = requests.Session()
+        self.session.headers.update({"user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'})
         if scraper_input.proxy:
             proxy_url = scraper_input.proxy
             proxies = {"http": proxy_url, "https": proxy_url}
diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py
index 80b91f8..9cdf28d 100644
--- a/homeharvest/core/scrapers/redfin/__init__.py
+++ b/homeharvest/core/scrapers/redfin/__init__.py
@@ -21,7 +21,7 @@ class RedfinScraper(Scraper):
     def _handle_location(self):
         url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(self.location)
 
-        response = self.session.get(url)
+        response = self.session.get(url, headers={"user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'})
         response_json = json.loads(response.text.replace("{}&&", ""))
 
         def get_region_type(match_type: str):