Add a new request class, GoozaliPartRequest (the "part" request), for fetching a smaller response

pull/231/head
Yariv Menachem 2024-12-16 15:48:59 +02:00
parent ce7bddf5c3
commit b4454bfc65
3 changed files with 76 additions and 1 deletions

View File

@ -9,6 +9,7 @@ from __future__ import annotations
from jobspy.scrapers import Scraper, ScraperInput from jobspy.scrapers import Scraper, ScraperInput
from jobspy.scrapers.goozali.model.GoozaliPartRequest import GoozaliPartRequest
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
from jobspy.scrapers.site import Site from jobspy.scrapers.site import Site
@ -55,6 +56,7 @@ class GoozaliScraper(Scraper):
seen_ids = set() seen_ids = set()
for view_id in self.view_ids: for view_id in self.view_ids:
full_request = GoozaliFullRequest(self.base_url) full_request = GoozaliFullRequest(self.base_url)
part_request = GoozaliPartRequest(self.base_url)
try: try:
response = self.session.get( response = self.session.get(
url=full_request.url, url=full_request.url,

View File

@ -9,7 +9,6 @@ class GoozaliFullRequest():
self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s" self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
self.stringifiedObjectParams = { self.stringifiedObjectParams = {
"shouldUseNestedResponseFormat": "true"} "shouldUseNestedResponseFormat": "true"}
self.session_id: str = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
self.cookies: dict[str, str] = {} self.cookies: dict[str, str] = {}
self.request_id: str = "req4q4tKw3woEEWxw&" self.request_id: str = "req4q4tKw3woEEWxw&"
self.share_id: str = "shrQBuWjXd0YgPqV6" self.share_id: str = "shrQBuWjXd0YgPqV6"

View File

@ -0,0 +1,74 @@
import json
class GoozaliPartRequest:
    """Bundle the URL, headers, query params and cookies for the "part" request.

    All identifiers (view, application, page-load, request, share ids and the
    access-policy signature) are hard-coded; only the URL template comes from
    the caller. After construction the ready-to-use values are exposed as the
    ``url``, ``headers``, ``params`` and ``cookies`` attributes.
    """

    def __init__(self, base_url: str):
        """Populate every request attribute.

        Args:
            base_url: A ``str.format`` template. It is formatted with the
                keyword ``view_id`` set to the hard-coded view id, so a
                ``{view_id}`` placeholder in it is substituted.
        """
        self.view_id: str = "viwNRSqqmqZLP0a3C"
        self.url = base_url.format(view_id=self.view_id)
        self.application_id: str = "app7OQjqEzTtCRq7u"
        self.air_table_page_load_id: str = "pglG8mlPvtT0UiBaN"
        self.stringifiedObjectParams = {
            "shouldUseNestedResponseFormat": "true",
        }
        self.session_id: str = ""
        self.request_id: str = "requFlC1ueInFAWHe"
        self.share_id: str = "shrNtlFxOG2ag1kyB"
        self.signature: str = (
            "64689d9701d871b8f3a3fe8ad01de23c06421011eb92a8816399a9e2a869b523"
        )
        # The generators below read the ids set above, so they must run last.
        self.headers = self._generate_headers()
        self.params = self._generate_params()
        # Assigned once (the original set this attribute twice to the same
        # empty dict).
        self.cookies: dict[str, str] = {}

    def _generate_params(self) -> dict[str, object]:
        """Return the query parameters for the request.

        Returns:
            A dict with ``stringifiedObjectParams`` (the nested dict held on
            the instance), ``request_id`` and ``accessPolicy`` (a JSON string
            produced by ``_generate_access_policy``).
        """
        # NOTE(review): ``stringifiedObjectParams`` is passed through as a dict
        # rather than a JSON string - confirm the HTTP client encodes it the
        # way the remote API expects.
        return {
            "stringifiedObjectParams": self.stringifiedObjectParams,
            "request_id": self.request_id,
            "accessPolicy": self._generate_access_policy(),
        }

    def _generate_headers(self) -> dict[str, str]:
        """Return the HTTP headers for the request.

        Most values are fixed browser-like strings; the application id and
        page-load id are taken from the instance.
        """
        return {
            'accept': '*/*',
            'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7',
            'priority': 'u=1, i',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'x-airtable-accept-msgpack': 'true',
            'x-airtable-application-id': self.application_id,
            'x-airtable-inter-service-client': 'webClient',
            'x-airtable-page-load-id': self.air_table_page_load_id,
            'x-early-prefetch': 'true',
            'x-requested-with': 'XMLHttpRequest',
            'x-time-zone': 'Asia/Jerusalem',
            'x-user-locale': 'en',
        }

    def _generate_access_policy(self) -> str:
        """Return the access policy serialized as a JSON string.

        The policy lists three view-level actions and one row-level action
        for the instance's view id, together with the share id, application
        id, a fixed generation number, a fixed expiry timestamp and the
        signature.
        """
        # NOTE(review): the expiry timestamp is hard-coded and presumably
        # tied to the signature - confirm how both are refreshed.
        view_actions = (
            "readSharedViewData",
            "getMetadataForPrinting",
            "readSignedAttachmentUrls",
        )
        allowed_actions = [
            {
                "modelClassName": "view",
                "modelIdSelector": self.view_id,
                "action": action,
            }
            for action in view_actions
        ]
        allowed_actions.append({
            "modelClassName": "row",
            "modelIdSelector": f"rows *[displayedInView={self.view_id}]",
            "action": "createDocumentPreviewSession",
        })
        access_policy = {
            "allowedActions": allowed_actions,
            "shareId": self.share_id,
            "applicationId": self.application_id,
            "generationNumber": 0,
            "expires": "2025-01-02T00:00:00.000Z",
            "signature": self.signature,
        }
        return json.dumps(access_policy)