Created a new class to hold all the data for the big request. NEXT: create another class based on this one to get a smaller response.

pull/231/head
Yariv Menachem 2024-12-15 21:42:53 +02:00
parent c40c4745d7
commit ce7bddf5c3
3 changed files with 91 additions and 21 deletions
src/jobspy/scrapers/goozali

View File

@ -9,10 +9,11 @@ from __future__ import annotations
from jobspy.scrapers import Scraper, ScraperInput
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
from jobspy.scrapers.site import Site
from ..utils import create_session, create_logger
from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
from .constants import headers
from ...jobs import (
JobPost,
JobResponse,
@ -41,16 +42,7 @@ class GoozaliScraper(Scraper):
clear_cookies=False,
)
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
def _get_params(self, view_id: str) -> dict[str, str]:
access_policy: str = get_access_policy(view_id)
params = {
"stringifiedObjectParams": stringifiedObjectParams,
"request_id": request_id,
"accessPolicy": access_policy
}
return params
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -61,19 +53,15 @@ class GoozaliScraper(Scraper):
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
for view_id in view_ids:
# create url
url = self.base_url.format(view_id=view_id)
params = self._get_params(view_id)
# create session -> run the api
for view_id in self.view_ids:
full_request = GoozaliFullRequest(self.base_url)
try:
response = self.session.get(
url=url,
params=params,
url=full_request.url,
params=full_request.params,
timeout=10,
headers=headers,
cookies=cookies
)
headers=full_request.headers,
cookies=full_request.cookies)
logger.info(f"response: {str(response)}")
if (response.status_code != 200):
logger.error(f"Status code: {

View File

@ -0,0 +1,74 @@
import json
class GoozaliFullRequest():
    """Bundle the URL, headers, query params and cookies for one request.

    Every identifier below is hard-coded. The instance exposes ``url``,
    ``headers``, ``params`` and ``cookies`` as ready-to-use attributes.

    Args:
        base_url: URL template containing a ``{view_id}`` placeholder, which
            is filled with this request's ``view_id``.
    """

    def __init__(self, base_url: str):
        self.view_id: str = "viwIOzPYaUGxlA0Jd"
        self.url = base_url.format(view_id=self.view_id)
        self.application_id: str = "appwewqLk7iUY4azc"
        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
        # Kept as a dict on the instance; it is JSON-encoded when the query
        # params are built (see _generate_params).
        self.stringifiedObjectParams = {
            "shouldUseNestedResponseFormat": "true"}
        # Not referenced by the headers/params built in this class.
        self.session_id: str = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
        self.cookies: dict[str, str] = {}
        # NOTE(review): the trailing "&" looks like a leftover from a copied
        # query string - confirm before changing it.
        self.request_id: str = "req4q4tKw3woEEWxw&"
        self.share_id: str = "shrQBuWjXd0YgPqV6"
        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
        # Both builders read the attributes assigned above, so they run last.
        self.headers = self._generate_headers()
        self.params = self._generate_params()

    def _generate_params(self) -> dict[str, str]:
        """Return the query parameters, every value already a string.

        ``stringifiedObjectParams`` is JSON-encoded here: passing the raw
        dict as a query value would not transmit its content as JSON.
        """
        access_policy = self._generate_access_policy()
        return {
            "stringifiedObjectParams": json.dumps(self.stringifiedObjectParams),
            "request_id": self.request_id,
            "accessPolicy": access_policy
        }

    def _generate_headers(self) -> dict[str, str]:
        """Return the HTTP headers, filled with this request's application
        and page-load ids."""
        return {
            'accept': '*/*',
            'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7',
            'priority': 'u=1, i',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'x-airtable-accept-msgpack': 'true',
            'x-airtable-application-id': self.application_id,
            'x-airtable-inter-service-client': 'webClient',
            'x-airtable-page-load-id': self.air_table_page_load_id,
            'x-early-prefetch': 'true',
            'x-requested-with': 'XMLHttpRequest',
            'x-time-zone': 'Asia/Jerusalem',
            'x-user-locale': 'en'
        }

    def _generate_access_policy(self) -> str:
        """
        Generates a JSON string for access policy.
        """
        access_policy = {
            "allowedActions": [
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "readSharedViewData"},
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "getMetadataForPrinting"},
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "readSignedAttachmentUrls"},
                {"modelClassName": "row", "modelIdSelector": f"rows *[displayedInView={self.view_id}]",
                 "action": "createDocumentPreviewSession"}
            ],
            "shareId": self.share_id,
            "applicationId": self.application_id,
            "generationNumber": 0,
            # NOTE(review): fixed expiry date; presumably paired with the
            # hard-coded signature - confirm how these are refreshed.
            "expires": "2025-01-02T00:00:00.000Z",
            "signature": self.signature
        }
        # Convert to a JSON string
        return json.dumps(access_policy)

View File

@ -0,0 +1,8 @@
from abc import ABC, abstractmethod
class GoozaliRequest(ABC):
    """Abstract base for request objects; cannot be instantiated directly."""

    @abstractmethod
    def create(self):
        """Abstract method to be implemented in subclasses."""
        return None