feat: proxy support

This commit is contained in:
Cullen Watson
2023-09-19 15:43:24 -05:00
parent 49d27943c4
commit fd9cdea499
7 changed files with 54 additions and 33 deletions

View File

@@ -8,7 +8,7 @@ class ScraperInput:
location: str
listing_type: ListingType
site_name: SiteName
-proxy_url: str | None = None
+proxy: str | None = None
class Scraper:
@@ -20,11 +20,9 @@ class Scraper:
self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name
-if scraper_input.proxy_url:
-    self.session.proxies = {
-        "http": scraper_input.proxy_url,
-        "https": scraper_input.proxy_url,
-    }
+self.proxy = (lambda p: {"http": p, "https": p} if p else None)(
+    scraper_input.proxy
+)
def search(self) -> list[Property]:
...

View File

@@ -39,6 +39,7 @@ class RealtorScraper(Scraper):
"https://parser-external.geo.moveaws.com/suggest",
params=params,
headers=headers,
+proxies=self.proxy,
)
response_json = response.json()
@@ -104,7 +105,7 @@ class RealtorScraper(Scraper):
"variables": variables,
}
-response = self.session.post(self.search_url, json=payload)
+response = self.session.post(self.search_url, json=payload, proxies=self.proxy)
response_json = response.json()
property_info = response_json["data"]["property"]
@@ -217,7 +218,7 @@ class RealtorScraper(Scraper):
"variables": variables,
}
-response = self.session.post(self.search_url, json=payload)
+response = self.session.post(self.search_url, json=payload, proxies=self.proxy)
response.raise_for_status()
response_json = response.json()

View File

@@ -16,7 +16,7 @@ class RedfinScraper(Scraper):
self.location
)
-response = self.session.get(url)
+response = self.session.get(url, proxies=self.proxy)
response_json = json.loads(response.text.replace("{}&&", ""))
def get_region_type(match_type: str):
@@ -111,7 +111,7 @@ class RedfinScraper(Scraper):
def _handle_rentals(self, region_id, region_type):
url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"
-response = self.session.get(url)
+response = self.session.get(url, proxies=self.proxy)
response.raise_for_status()
homes = response.json()
@@ -211,7 +211,7 @@ class RedfinScraper(Scraper):
home_id
)
-response = self.session.get(url)
+response = self.session.get(url, proxies=self.proxy)
response_json = json.loads(response.text.replace("{}&&", ""))
parsed_home = self._parse_home(
@@ -233,7 +233,7 @@ class RedfinScraper(Scraper):
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&num_homes=100000"
else:
url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}&sold_within_days=30&num_homes=100000"
-response = self.session.get(url)
+response = self.session.get(url, proxies=self.proxy)
response_json = json.loads(response.text.replace("{}&&", ""))
homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]

View File

@@ -1,6 +1,5 @@
import re
import json
-import string
from .. import Scraper
from ....utils import parse_address_two, parse_unit
from ....exceptions import GeoCoordsNotFound, NoResultsFound
@@ -27,12 +26,14 @@ class ZillowScraper(Scraper):
"}&abKey=6666272a-4b99-474c-b857-110ec438732b&clientId=homepage-render"
).format(location)
-response = self.session.get(url)
+response = self.session.get(url, proxies=self.proxy)
return response.json()["results"] != []
def search(self):
-resp = self.session.get(self.url, headers=self._get_headers())
+resp = self.session.get(
+    self.url, headers=self._get_headers(), proxies=self.proxy
+)
resp.raise_for_status()
content = resp.text
@@ -129,7 +130,9 @@ class ZillowScraper(Scraper):
"wants": {"cat1": ["mapResults"]},
"isDebugRequest": False,
}
-resp = self.session.put(url, headers=self._get_headers(), json=payload)
+resp = self.session.put(
+    url, headers=self._get_headers(), json=payload, proxies=self.proxy
+)
resp.raise_for_status()
a = resp.json()
return self._parse_properties(resp.json())