Mirror of https://github.com/Bunsly/HomeHarvest.git
synced 2026-03-05 12:04:31 -08:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 8388d47f73 | | |
| | ba503b0ca3 | | |
| | 8962d619e1 | | |
| | 3b7c17b7b5 | | |
@@ -1,5 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
|
import tls_client
|
||||||
from .models import Property, ListingType, SiteName
|
from .models import Property, ListingType, SiteName
|
||||||
|
|
||||||
|
|
||||||
@@ -12,15 +13,20 @@ class ScraperInput:
|
|||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self, scraper_input: ScraperInput):
|
def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None):
|
||||||
self.location = scraper_input.location
|
self.location = scraper_input.location
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
|
|
||||||
self.session = requests.Session()
|
if not session:
|
||||||
|
self.session = requests.Session()
|
||||||
|
else:
|
||||||
|
self.session = session
|
||||||
|
|
||||||
if scraper_input.proxy:
|
if scraper_input.proxy:
|
||||||
proxy_url = scraper_input.proxy
|
proxy_url = scraper_input.proxy
|
||||||
proxies = {"http": proxy_url, "https": proxy_url}
|
proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
self.session.proxies.update(proxies)
|
self.session.proxies.update(proxies)
|
||||||
|
|
||||||
self.listing_type = scraper_input.listing_type
|
self.listing_type = scraper_input.listing_type
|
||||||
self.site_name = scraper_input.site_name
|
self.site_name = scraper_input.site_name
|
||||||
|
|
||||||
|
|||||||
@@ -20,19 +20,17 @@ from datetime import datetime, timedelta
|
|||||||
|
|
||||||
class ZillowScraper(Scraper):
|
class ZillowScraper(Scraper):
|
||||||
def __init__(self, scraper_input):
|
def __init__(self, scraper_input):
|
||||||
super().__init__(scraper_input)
|
session = tls_client.Session(
|
||||||
self.session = tls_client.Session(
|
|
||||||
client_identifier="chrome112", random_tls_extension_order=True
|
client_identifier="chrome112", random_tls_extension_order=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
super().__init__(scraper_input, session)
|
||||||
|
|
||||||
self.session.headers.update({
|
self.session.headers.update({
|
||||||
'authority': 'www.zillow.com',
|
'authority': 'www.zillow.com',
|
||||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||||||
'accept-language': 'en-US,en;q=0.9',
|
'accept-language': 'en-US,en;q=0.9',
|
||||||
'cache-control': 'max-age=0',
|
'cache-control': 'max-age=0',
|
||||||
'sec-ch-ua': '"Chromium";v="117", "Not)A;Brand";v="24", "Google Chrome";v="117"',
|
|
||||||
'sec-ch-ua-mobile': '?0',
|
|
||||||
'sec-ch-ua-platform': '"Windows"',
|
|
||||||
'sec-fetch-dest': 'document',
|
'sec-fetch-dest': 'document',
|
||||||
'sec-fetch-mode': 'navigate',
|
'sec-fetch-mode': 'navigate',
|
||||||
'sec-fetch-site': 'same-origin',
|
'sec-fetch-site': 'same-origin',
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "homeharvest"
|
name = "homeharvest"
|
||||||
version = "0.2.17"
|
version = "0.2.19"
|
||||||
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
homepage = "https://github.com/ZacharyHampton/HomeHarvest"
|
||||||
|
|||||||
Reference in New Issue
Block a user