From ce7bddf5c3b9a3c28f6855321521bb24b6b1a841 Mon Sep 17 00:00:00 2001 From: Yariv Menachem Date: Sun, 15 Dec 2024 21:42:53 +0200 Subject: [PATCH] created new class to hold all data for the big request NEXT: create another based on this to get smaller response --- src/jobspy/scrapers/goozali/__init__.py | 30 +++----- .../scrapers/goozali/model/FullRequest.py | 74 +++++++++++++++++++ .../scrapers/goozali/model/GoozaliRequest.py | 8 ++ 3 files changed, 91 insertions(+), 21 deletions(-) create mode 100644 src/jobspy/scrapers/goozali/model/FullRequest.py create mode 100644 src/jobspy/scrapers/goozali/model/GoozaliRequest.py diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/jobspy/scrapers/goozali/__init__.py index 117a8db..79b505e 100644 --- a/src/jobspy/scrapers/goozali/__init__.py +++ b/src/jobspy/scrapers/goozali/__init__.py @@ -9,10 +9,11 @@ from __future__ import annotations from jobspy.scrapers import Scraper, ScraperInput +from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest from jobspy.scrapers.site import Site from ..utils import create_session, create_logger -from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids +from .constants import headers from ...jobs import ( JobPost, JobResponse, @@ -41,16 +42,7 @@ class GoozaliScraper(Scraper): clear_cookies=False, ) self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData" - - def _get_params(self, view_id: str) -> dict[str, str]: - access_policy: str = get_access_policy(view_id) - params = { - "stringifiedObjectParams": stringifiedObjectParams, - "request_id": request_id, - "accessPolicy": access_policy - } - - return params + self.view_ids = ["viwIOzPYaUGxlA0Jd"] def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ @@ -61,19 +53,15 @@ class GoozaliScraper(Scraper): self.scraper_input = scraper_input job_list: list[JobPost] = [] seen_ids = set() - for view_id in view_ids: - # create url - url = self.base_url.format(view_id=view_id) - params = self._get_params(view_id) - # create session -> run the api + for view_id in self.view_ids: + full_request = GoozaliFullRequest(self.base_url) try: response = self.session.get( - url=url, - params=params, + url=full_request.url, + params=full_request.params, timeout=10, - headers=headers, - cookies=cookies - ) + headers=full_request.headers, + cookies=full_request.cookies) logger.info(f"response: {str(response)}") if (response.status_code != 200): logger.error(f"Status code: { diff --git a/src/jobspy/scrapers/goozali/model/FullRequest.py b/src/jobspy/scrapers/goozali/model/FullRequest.py new file mode 100644 index 0000000..02de03d --- /dev/null +++ b/src/jobspy/scrapers/goozali/model/FullRequest.py @@ -0,0 +1,74 @@ +import json + + +class GoozaliFullRequest(): + def __init__(self, base_url: str): + self.view_id: str = "viwIOzPYaUGxlA0Jd" + self.url = base_url.format(view_id=self.view_id) + self.application_id: str = "appwewqLk7iUY4azc" + self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s" + self.stringifiedObjectParams = { + "shouldUseNestedResponseFormat": "true"} + self.session_id: str = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E=" + self.cookies: dict[str, str] = {} + self.request_id: str = "req4q4tKw3woEEWxw&" + self.share_id: str = "shrQBuWjXd0YgPqV6" + self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59" + self.headers = self._generate_headers() + self.params = self._generate_params() + self.cookies = {} + + def _generate_params(self) -> dict[str, str]: + access_policy = self._generate_access_policy() + + return { + "stringifiedObjectParams": self.stringifiedObjectParams, + "request_id": self.request_id, + "accessPolicy": access_policy + } + + def _generate_headers(self) -> str: + return { + 'accept': '*/*', + 'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7', + 'priority': 'u=1, i', + 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + 'x-airtable-accept-msgpack': 'true', + 'x-airtable-application-id': self.application_id, + 'x-airtable-inter-service-client': 'webClient', + 'x-airtable-page-load-id': self.air_table_page_load_id, + 'x-early-prefetch': 'true', + 'x-requested-with': 'XMLHttpRequest', + 'x-time-zone': 'Asia/Jerusalem', + 'x-user-locale': 'en' + } + + def _generate_access_policy(self) -> str: + """ + Generates a JSON string for access policy. + """ + access_policy = { + "allowedActions": [ + {"modelClassName": "view", "modelIdSelector": self.view_id, + "action": "readSharedViewData"}, + {"modelClassName": "view", "modelIdSelector": self.view_id, + "action": "getMetadataForPrinting"}, + {"modelClassName": "view", "modelIdSelector": self.view_id, + "action": "readSignedAttachmentUrls"}, + {"modelClassName": "row", "modelIdSelector": f"rows *[displayedInView={self.view_id}]", + "action": "createDocumentPreviewSession"} + ], + "shareId": self.share_id, + "applicationId": self.application_id, + "generationNumber": 0, + "expires": "2025-01-02T00:00:00.000Z", + "signature": self.signature + } + # Convert to a JSON string + return json.dumps(access_policy) diff --git a/src/jobspy/scrapers/goozali/model/GoozaliRequest.py b/src/jobspy/scrapers/goozali/model/GoozaliRequest.py new file mode 100644 index 0000000..3ec3594 --- /dev/null +++ b/src/jobspy/scrapers/goozali/model/GoozaliRequest.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod + + +class GoozaliRequest(ABC): + @abstractmethod + def create(self): + """Abstract method to be implemented in subclasses.""" + pass