Added a JSON file to test the mapper that converts the JSON response into model classes

pull/231/head
Yariv Menachem 2024-12-16 20:32:07 +02:00
parent b4454bfc65
commit bbe3d6df35
8 changed files with 29558 additions and 91 deletions

View File

@ -0,0 +1,77 @@
import json
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
# Mapping function to convert parsed dictionary into GoozaliResponseData
class GoozaliMapper:
    """Turn a decoded Goozali JSON payload into the Goozali model objects."""

    def _map_dict_to_goozali_response_column_choice(self, column_choices: dict) -> dict[str, GoozaliColumnChoice]:
        """Build one GoozaliColumnChoice per entry, keyed exactly like the input.

        Each value must provide 'id' and 'name'; a missing 'color' becomes "".
        """
        return {
            choice_key: GoozaliColumnChoice(
                id=raw_choice['id'],
                name=raw_choice['name'],
                color=raw_choice.get('color', ""),
            )
            for choice_key, raw_choice in column_choices.items()
        }

    def _map_dict_to_goozali_response_column_type_option(self, type_options: dict) -> GoozaliColumnTypeOptions:
        """Wrap a raw type-options dict and convert its choices, if any, to models."""
        mapped_options = GoozaliColumnTypeOptions(typeOptions=type_options)
        if not mapped_options.choices:
            # Nothing to convert: leave the (empty) choices untouched.
            return mapped_options
        mapped_options.choices = self._map_dict_to_goozali_response_column_choice(
            mapped_options.choices)
        return mapped_options

    def _map_dict_to_goozali_column(self, raw_column: dict) -> GoozaliColumn:
        """Build a single GoozaliColumn, converting its typeOptions when truthy."""
        column = GoozaliColumn(**raw_column)
        if column.typeOptions:
            column.typeOptions = self._map_dict_to_goozali_response_column_type_option(
                column.typeOptions)
        return column

    def _map_dict_to_goozali_response_columns(self, columns: list) -> list[GoozaliColumn]:
        """Convert every raw column dict, preserving the input order."""
        return [self._map_dict_to_goozali_column(raw_column) for raw_column in columns]

    def _map_dict_to_goozali_response_data(self, data: dict) -> GoozaliResponseData:
        """Convert the payload's 'data' dict into a GoozaliResponseData.

        Columns are mapped first, then rows, then the remaining keys are read
        in constructor order; any missing key raises KeyError.
        """
        mapped_columns = self._map_dict_to_goozali_response_columns(data['columns'])
        mapped_rows = [GoozaliRow(**raw_row) for raw_row in data['rows']]
        return GoozaliResponseData(
            applicationId=data['applicationId'],
            id=data['id'],
            name=data['name'],
            columns=mapped_columns,
            primaryColumnId=data['primaryColumnId'],
            meaningfulColumnOrder=data['meaningfulColumnOrder'],
            viewOrder=data['viewOrder'],
            rows=mapped_rows,
        )

    def map_response_to_goozali_response(self, response) -> GoozaliResponse:
        """Decode ``response.content`` (bytes) as UTF-8 JSON and map it to a GoozaliResponse."""
        payload = json.loads(response.content.decode('utf-8'))
        # 'data' is converted before 'msg' is read, matching the lookup order callers may rely on.
        data_obj = self._map_dict_to_goozali_response_data(payload['data'])
        return GoozaliResponse(msg=payload['msg'], data=data_obj)

View File

@ -6,11 +6,13 @@ This module contains routines to scrape Goozali.
"""
from __future__ import annotations
import datetime
import json
from jobspy.scrapers import Scraper, ScraperInput
from jobspy.scrapers.goozali.model.GoozaliPartRequest import GoozaliPartRequest
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliResponse, GoozaliPartRequest, GoozaliFullRequest
from jobspy.scrapers.site import Site
from ..utils import create_session, create_logger
@ -42,9 +44,55 @@ class GoozaliScraper(Scraper):
delay=5,
clear_cookies=False,
)
self.mapper = GoozaliMapper()
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
def map_respone_to_goozali_response(self, response) -> GoozaliResponse:
    """Parse ``response.content`` (bytes) as UTF-8 JSON and splat it into GoozaliResponse.

    The top-level JSON keys are passed as keyword arguments, so nested values
    stay as plain dicts/lists rather than model objects.
    """
    payload = json.loads(response.content.decode('utf-8'))
    return GoozaliResponse(**payload)
# Function to filter GoozaliRows based on hours old
def filter_rows_by_hours(rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
    """Return the rows whose ``createdTime`` is at most ``hours`` hours old.

    Args:
        rows: Objects exposing ``createdTime`` as a string formatted like
            ``2023-01-09T10:32:09.000Z``.
        hours: Maximum allowed age, in hours.

    Returns:
        A new list with the matching rows, in their original order.

    Raises:
        ValueError: If a ``createdTime`` does not match the expected format.
    """
    # NOTE(review): declared without `self`; if this lives inside the scraper
    # class it can only be called through the class object - confirm placement.
    utc = datetime.timezone.utc
    created_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    # This file does `import datetime` (the module), so the class has to be
    # reached as `datetime.datetime`; calling utcnow()/strptime() on the
    # module itself raises AttributeError.
    now = datetime.datetime.now(utc)
    max_age = datetime.timedelta(hours=hours)

    def created_at(row):
        # The literal trailing "Z" marks UTC, so attach that zone explicitly
        # and compare aware datetimes on both sides.
        parsed = datetime.datetime.strptime(row.createdTime, created_format)
        return parsed.replace(tzinfo=utc)

    return [row for row in rows if now - created_at(row) <= max_age]
def find_column(self, columns: list[GoozaliColumn], column_name: str) -> GoozaliColumn:
    """Return the first column whose ``name`` equals ``column_name``.

    Yields ``None`` when no column matches (including an empty ``columns``).
    """
    matching = (candidate for candidate in columns if candidate.name == column_name)
    return next(matching, None)
# def filter_rows_by_field_column(rows: list[GoozaliRow], field_column: Column) -> list[GoozaliRow]:
# # Current time
# now = datetime.utcnow()
# # Calculate the time delta for the given hours
# time_delta = datetime.timedelta(hours=hours)
# # Filter rows
# filtered_rows = [
# row for row in rows
# if now - datetime.strptime(row.createdTime, "%Y-%m-%dT%H:%M:%S.%fZ") <= time_delta
# ]
# return filtered_rows
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Goozali for jobs with scraper_input criteria
@ -66,14 +114,23 @@ class GoozaliScraper(Scraper):
cookies=full_request.cookies)
logger.info(f"response: {str(response)}")
if (response.status_code != 200):
logger.error(f"Status code: {
response.status_code}, Error: {str(response.text)}")
logger.error(f"Status code: {response.status_code}, Error: {
str(response.text)}")
return JobResponse(jobs=job_list)
except Exception as e:
logger.error(f"Exception: {str(e)}")
# model the response with models
goozali_response = self.mapper.map_response_to_goozali_response(
response=response)
# goozali_response: GoozaliResponse = self.map_respone_to_goozali_response(
# response)
# create map columnId to Column object
field = self.find_column(
goozali_response.data.columns, "Field")
# filter result by Field like the web
# filter by date
# filtered_rows_by_age = filter_rows_by_hours(
# goozali_response.data.table.rows, scraper_input.hours_old)
# map to JobResponse Object
return JobResponse(jobs=job_list)

View File

@ -1,10 +1,23 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
class GoozaliColumnTypeOptions:
    """Type-specific settings of a column, read from its raw ``typeOptions`` dict.

    Only the dict-based constructor is defined. Earlier keyword-style
    ``__init__`` definitions (choiceOrder / choices / disableColors) were
    shadowed by it - Python binds only the last ``__init__`` in a class body -
    so they were dead code and have been dropped.
    """

    def __init__(self, typeOptions: dict):
        """Copy the known keys from ``typeOptions``, defaulting any that are absent.

        Args:
            typeOptions: Raw options mapping; every key is optional.
        """
        # Select-style options.
        self.choiceOrder = typeOptions.get("choiceOrder", [])
        # Starts as whatever the payload holds under "choices".
        self.choices: dict[str, GoozaliColumnChoice] = typeOptions.get(
            "choices", {})
        self.disableColors = typeOptions.get("disableColors", False)
        # Date/time display options.
        self.dateFormat = typeOptions.get("dateFormat", "")
        self.isDateTime = typeOptions.get("isDateTime", False)
        self.timeZone = typeOptions.get("timeZone", "")
        self.shouldDisplayTimeZone = typeOptions.get(
            "shouldDisplayTimeZone", False)
        # Formula/result options.
        self.formulaTextParsed = typeOptions.get("formulaTextParsed", "")
        self.dependencies = typeOptions.get("dependencies", [])
        self.resultType = typeOptions.get("resultType", "")
        self.resultIsArray = typeOptions.get("resultIsArray", False)

View File

@ -1,17 +0,0 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
class GoozaliTable:
    """Plain holder for a table's identifiers, columns, ordering info and rows."""

    def __init__(
        self,
        applicationId: str,
        id: str,
        name: str,
        columns: List[GoozaliColumn],
        primaryColumnId: str,
        meaningfulColumnOrder: List[Dict[str, str]],
        viewOrder: List[str],
        rows: List[GoozaliRow],
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Identity of the table.
        self.applicationId, self.id, self.name = applicationId, id, name
        # Column definitions and which of them is the primary one.
        self.columns, self.primaryColumnId = columns, primaryColumnId
        # Ordering metadata.
        self.meaningfulColumnOrder, self.viewOrder = meaningfulColumnOrder, viewOrder
        # Row payload.
        self.rows = rows

View File

@ -1,6 +1,15 @@
from jobspy.scrapers.goozali.model import GoozaliTable
from jobspy.scrapers.goozali.model import GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
class GoozaliResponseData:
    """The ``data`` part of a Goozali response: table metadata, columns and rows.

    Only the flat constructor is defined. An earlier ``__init__(self, table)``
    was shadowed by it - Python binds only the last ``__init__`` in a class
    body - so it was dead code and has been dropped.
    """

    def __init__(self, applicationId: str, id: str, name: str, columns: list[GoozaliColumn], primaryColumnId: str,
                 meaningfulColumnOrder: list[dict[str, str]], viewOrder: list[str], rows: list[GoozaliRow]):
        """Store every argument unchanged on the attribute of the same name."""
        self.applicationId = applicationId
        self.id = id
        self.name = name
        self.columns = columns
        self.primaryColumnId = primaryColumnId
        self.meaningfulColumnOrder = meaningfulColumnOrder
        self.viewOrder = viewOrder
        self.rows = rows

View File

@ -0,0 +1,6 @@
from .GoozaliRow import GoozaliRow
from .GoozaliResponse import GoozaliResponse
from .GoozaliColumn import GoozaliColumn
from .GoozaliPartRequest import GoozaliPartRequest
from .FullRequest import GoozaliFullRequest
from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions

File diff suppressed because it is too large Load Diff

View File

@ -1,71 +1,28 @@
import json
import os
from jobspy import scrape_jobs
import pandas as pd
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.model import GoozaliResponse
# URL Example
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
def test_goozali():
    """Scrape five "engineer" jobs and check a five-row DataFrame comes back."""
    # NOTE(review): the site scraped here is "glassdoor" although the test is
    # named after Goozali - confirm which site is intended.
    result = scrape_jobs(site_name="glassdoor", search_term="engineer", results_wanted=5)
    is_five_row_frame = isinstance(result, pd.DataFrame) and len(result) == 5
    assert is_five_row_frame, "Result should be a non-empty DataFrame"
def createMockGoozaliResponse() -> GoozaliResponse:
    """Build a small in-memory GoozaliResponse fixture with one column and one row.

    The objects are constructed with the signatures the model classes actually
    expose: GoozaliColumnTypeOptions takes a single ``typeOptions`` dict and
    GoozaliResponseData takes the table fields directly (no wrapping table
    object).
    """
    industry_choices = {
        "selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
        "selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink"),
    }
    industry_type_options = GoozaliColumnTypeOptions(typeOptions={
        "choiceOrder": ["selcE6QUv4vWIIcZR",
                        "sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
        "choices": industry_choices,
        "disableColors": False,
    })
    industry_column = GoozaliColumn(
        id="fldIf9DbRpNRLJXuD",
        name="Industry",
        description=None,
        type="multiSelect",
        typeOptions=industry_type_options,
        default=None,
        initialCreatedTime="2022-12-29T10:23:21.000Z",
        initialCreatedByUserId="usr1fVy2RIyCuGHec",
        lastModifiedTime="2024-07-21T09:30:02.000Z",
        lastModifiedByUserId="usr1fVy2RIyCuGHec",
        isEditableFromSync=False
    )
    sample_row = GoozaliRow(
        id="recwiKgHT9mJrqoxa",
        createdTime="2023-01-09T10:32:09.000Z",
        cellValuesByColumnId={
            "fldLT11B0cpV6p9Uz": ["3M"],
            # The row references both choices declared on the Industry column.
            "fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"]
        }
    )
    data = GoozaliResponseData(
        applicationId="app7OQjqEzTtCRq7u",
        id="tblBQjp5Aw6O172VY",
        name="Shared view table",
        columns=[industry_column],
        primaryColumnId="fldLT11B0cpV6p9Uz",
        meaningfulColumnOrder=[
            {"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
            {"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
            {"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182}
        ],
        viewOrder=["viwNRSqqmqZLP0a3C"],
        rows=[sample_row]
    )
    return GoozaliResponse(msg="SUCCESS", data=data)
# Smoke check: load the recorded JSON response and run it through the mapper.
try:
    # The fixture path is resolved against the current working directory.
    current_directory = os.getcwd()
    file_path = os.path.join(current_directory, 'src',
                             'tests', 'goozali_response_example.json')
    # JSON text is UTF-8. Reading it as ISO-8859-1 never fails (every byte maps
    # to some character), which silently garbles non-ASCII text and makes the
    # UnicodeDecodeError handler below unreachable.
    with open(file_path, 'r', encoding='utf-8') as file:
        test_json_response = json.load(file)
    print(test_json_response['msg'])  # Output: Success
    mapper = GoozaliMapper()
    response = mapper._map_dict_to_goozali_response_data(
        test_json_response['data'])
    print("ya gever!!")
except FileNotFoundError:
    print("The file was not found.")
except json.JSONDecodeError:
    print("There was an error decoding the JSON data.")
except UnicodeDecodeError as e:
    print(f"Unicode decode error: {e}")