mirror of https://github.com/Bunsly/JobSpy
Created a new class to hold all data for the big request. NEXT: create another class based on this one to get a smaller response.
parent
c40c4745d7
commit
ce7bddf5c3
|
@ -9,10 +9,11 @@ from __future__ import annotations
|
|||
|
||||
|
||||
from jobspy.scrapers import Scraper, ScraperInput
|
||||
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
|
||||
from jobspy.scrapers.site import Site
|
||||
|
||||
from ..utils import create_session, create_logger
|
||||
from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
|
||||
from .constants import headers
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
JobResponse,
|
||||
|
@ -41,16 +42,7 @@ class GoozaliScraper(Scraper):
|
|||
clear_cookies=False,
|
||||
)
|
||||
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
||||
|
||||
def _get_params(self, view_id: str) -> dict[str, str]:
    """
    Build the query parameters for a shared-view data request.

    :param view_id: identifier of the view; forwarded to
        ``get_access_policy`` to obtain the ``accessPolicy`` value.
    :return: mapping with the ``stringifiedObjectParams``, ``request_id``
        and ``accessPolicy`` entries.
    """
    # stringifiedObjectParams and request_id are module-level values
    # imported from .constants; only the access policy depends on view_id.
    return {
        "stringifiedObjectParams": stringifiedObjectParams,
        "request_id": request_id,
        "accessPolicy": get_access_policy(view_id),
    }
|
||||
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
|
@ -61,19 +53,15 @@ class GoozaliScraper(Scraper):
|
|||
self.scraper_input = scraper_input
|
||||
job_list: list[JobPost] = []
|
||||
seen_ids = set()
|
||||
for view_id in view_ids:
|
||||
# create url
|
||||
url = self.base_url.format(view_id=view_id)
|
||||
params = self._get_params(view_id)
|
||||
# create session -> run the api
|
||||
for view_id in self.view_ids:
|
||||
full_request = GoozaliFullRequest(self.base_url)
|
||||
try:
|
||||
response = self.session.get(
|
||||
url=url,
|
||||
params=params,
|
||||
url=full_request.url,
|
||||
params=full_request.params,
|
||||
timeout=10,
|
||||
headers=headers,
|
||||
cookies=cookies
|
||||
)
|
||||
headers=full_request.headers,
|
||||
cookies=full_request.cookies)
|
||||
logger.info(f"response: {str(response)}")
|
||||
if (response.status_code != 200):
|
||||
logger.error(f"Status code: {
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
import json
|
||||
|
||||
|
||||
class GoozaliFullRequest:
    """
    Holds every value needed to issue the "full" shared-view data request.

    After construction the instance exposes ``url``, ``params``, ``headers``
    and ``cookies``, ready to be handed to an HTTP client call.
    """

    def __init__(self, base_url: str):
        """
        :param base_url: URL template containing a ``{view_id}`` placeholder,
            which is filled in with this request's view id.
        """
        self.view_id: str = "viwIOzPYaUGxlA0Jd"
        self.url = base_url.format(view_id=self.view_id)
        self.application_id: str = "appwewqLk7iUY4azc"
        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
        self.stringifiedObjectParams = {
            "shouldUseNestedResponseFormat": "true"}
        # Not referenced by any method of this class; kept as a public
        # attribute for callers that may read it.
        self.session_id: str = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
        self.cookies: dict[str, str] = {}
        # NOTE(review): the trailing "&" looks like a leftover from copying
        # a query string - confirm before removing it.
        self.request_id: str = "req4q4tKw3woEEWxw&"
        self.share_id: str = "shrQBuWjXd0YgPqV6"
        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
        # Derived values go last: the generators read the attributes above.
        self.headers = self._generate_headers()
        self.params = self._generate_params()

    def _generate_params(self) -> dict[str, str]:
        """
        Build the query parameters of the request.

        Every value is a string: ``stringifiedObjectParams`` is serialised
        to JSON here so the mapping really is ``dict[str, str]`` and the
        nested object is transmitted as a JSON value rather than as a raw
        dict.

        :return: mapping with the ``stringifiedObjectParams``,
            ``request_id`` and ``accessPolicy`` entries.
        """
        return {
            "stringifiedObjectParams": json.dumps(self.stringifiedObjectParams),
            "request_id": self.request_id,
            "accessPolicy": self._generate_access_policy(),
        }

    def _generate_headers(self) -> dict[str, str]:
        """
        Build the HTTP headers of the request.

        All values are fixed literals except the application id and the
        page-load id, which come from the instance attributes.

        :return: header name to header value mapping.
        """
        return {
            'accept': '*/*',
            'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7',
            'priority': 'u=1, i',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'x-airtable-accept-msgpack': 'true',
            'x-airtable-application-id': self.application_id,
            'x-airtable-inter-service-client': 'webClient',
            'x-airtable-page-load-id': self.air_table_page_load_id,
            'x-early-prefetch': 'true',
            'x-requested-with': 'XMLHttpRequest',
            'x-time-zone': 'Asia/Jerusalem',
            'x-user-locale': 'en',
        }

    def _generate_access_policy(self) -> str:
        """
        Generates a JSON string for access policy.

        The policy lists the allowed actions for this view and carries the
        share id, application id, expiry timestamp and signature.

        :return: the access policy serialised with ``json.dumps``.
        """
        access_policy = {
            "allowedActions": [
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "readSharedViewData"},
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "getMetadataForPrinting"},
                {"modelClassName": "view", "modelIdSelector": self.view_id,
                 "action": "readSignedAttachmentUrls"},
                {"modelClassName": "row", "modelIdSelector": f"rows *[displayedInView={self.view_id}]",
                 "action": "createDocumentPreviewSession"},
            ],
            "shareId": self.share_id,
            "applicationId": self.application_id,
            "generationNumber": 0,
            # NOTE(review): fixed expiry timestamp; presumably it is tied to
            # the hard-coded signature, so the two need refreshing together
            # - confirm.
            "expires": "2025-01-02T00:00:00.000Z",
            "signature": self.signature,
        }
        # Convert to a JSON string
        return json.dumps(access_policy)
|
|
@ -0,0 +1,8 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class GoozaliRequest(ABC):
    """
    Abstract base class for request objects.

    It cannot be instantiated directly; concrete subclasses must override
    ``create``.
    """

    @abstractmethod
    def create(self):
        """Abstract method to be implemented in subclasses."""
|
Loading…
Reference in New Issue