mirror of https://github.com/Bunsly/JobSpy
added json file to test the mapper, from json response to classes
parent
b4454bfc65
commit
bbe3d6df35
|
@ -0,0 +1,77 @@
|
|||
import json
|
||||
|
||||
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
||||
|
||||
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||
|
||||
|
||||
class GoozaliMapper:
    """Translate the parsed JSON of a Goozali response into model objects."""

    def _map_dict_to_goozali_response_column_choice(self, column_choices: dict) -> dict[str, GoozaliColumnChoice]:
        """Build a GoozaliColumnChoice for every entry, keeping the same keys.

        Each value must provide 'id' and 'name'; 'color' is optional.
        """
        return {
            choice_key: GoozaliColumnChoice(
                id=raw_choice['id'],
                name=raw_choice['name'],
                # 'color' may be absent from the payload; default to "".
                color=raw_choice.get('color', ""),
            )
            for choice_key, raw_choice in column_choices.items()
        }

    def _map_dict_to_goozali_response_column_type_option(self, type_options: dict) -> GoozaliColumnTypeOptions:
        """Wrap a raw type-options dict and convert its choices, if it has any."""
        options = GoozaliColumnTypeOptions(typeOptions=type_options)
        raw_choices = options.choices
        if not raw_choices:
            # Nothing to convert: leave the (empty) choices untouched.
            return options

        options.choices = self._map_dict_to_goozali_response_column_choice(
            raw_choices)
        return options

    def _map_dict_to_goozali_response_columns(self, columns: list) -> list[GoozaliColumn]:
        """Turn a list of raw column dicts into GoozaliColumn objects."""
        mapped_columns: list[GoozaliColumn] = []
        for raw_column in columns:
            column = GoozaliColumn(**raw_column)
            raw_options = column.typeOptions
            if raw_options:
                # Replace the raw dict with its typed representation.
                column.typeOptions = self._map_dict_to_goozali_response_column_type_option(
                    raw_options)
            mapped_columns.append(column)
        return mapped_columns

    def _map_dict_to_goozali_response_data(self, data: dict) -> GoozaliResponseData:
        """Build GoozaliResponseData from the 'data' section of the payload."""
        mapped_columns = self._map_dict_to_goozali_response_columns(
            data['columns'])
        mapped_rows = [GoozaliRow(**raw_row) for raw_row in data['rows']]

        return GoozaliResponseData(
            applicationId=data['applicationId'],
            id=data['id'],
            name=data['name'],
            columns=mapped_columns,
            primaryColumnId=data['primaryColumnId'],
            meaningfulColumnOrder=data['meaningfulColumnOrder'],
            viewOrder=data['viewOrder'],
            rows=mapped_rows,
        )

    def map_response_to_goozali_response(self, response) -> GoozaliResponse:
        """Decode ``response.content`` as UTF-8 JSON and map it to a GoozaliResponse.

        The payload must contain a 'data' object and a 'msg' value.
        """
        # ``response.content`` is bytes: decode first, then parse as JSON.
        payload = json.loads(response.content.decode('utf-8'))

        # The nested 'data' dict is converted before 'msg' is read.
        response_data = self._map_dict_to_goozali_response_data(
            payload['data'])

        return GoozaliResponse(msg=payload['msg'], data=response_data)
|
|
@ -6,11 +6,13 @@ This module contains routines to scrape Goozali.
|
|||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import datetime
|
||||
import json
|
||||
|
||||
|
||||
from jobspy.scrapers import Scraper, ScraperInput
|
||||
from jobspy.scrapers.goozali.model.GoozaliPartRequest import GoozaliPartRequest
|
||||
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
|
||||
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliResponse, GoozaliPartRequest, GoozaliFullRequest
|
||||
from jobspy.scrapers.site import Site
|
||||
|
||||
from ..utils import create_session, create_logger
|
||||
|
@ -42,9 +44,55 @@ class GoozaliScraper(Scraper):
|
|||
delay=5,
|
||||
clear_cookies=False,
|
||||
)
|
||||
self.mapper = GoozaliMapper()
|
||||
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
||||
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
||||
|
||||
def map_respone_to_goozali_response(self, response) -> GoozaliResponse:
    """Parse ``response.content`` (UTF-8 JSON bytes) into a GoozaliResponse.

    The top-level JSON keys are passed straight to the GoozaliResponse
    constructor as keyword arguments.
    """
    # NOTE(review): the visible scrape path calls
    # self.mapper.map_response_to_goozali_response instead - confirm whether
    # this method is still needed.
    payload = json.loads(response.content.decode('utf-8'))
    return GoozaliResponse(**payload)
|
||||
|
||||
# Function to filter GoozaliRows based on hours old
|
||||
def filter_rows_by_hours(rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
    """Return the rows whose ``createdTime`` lies within the last ``hours`` hours.

    Args:
        rows: Objects exposing ``createdTime`` as a UTC timestamp string in
            the form ``%Y-%m-%dT%H:%M:%S.%fZ``.
        hours: Maximum age in hours. ``None`` disables the filter and a copy
            of all rows is returned.

    Returns:
        A new list holding the matching rows in their original order.

    Raises:
        ValueError: If a ``createdTime`` does not match the expected format.
    """
    # NOTE(review): declared without ``self``; if this lives inside the
    # scraper class it needs @staticmethod (or a module-level home) to be
    # callable through an instance - confirm against the real file layout.
    if hours is None:
        return list(rows)

    # The file does ``import datetime`` (the module), so the class must be
    # reached as ``datetime.datetime``; calling ``datetime.utcnow()`` or
    # ``datetime.strptime()`` on the module raises AttributeError.
    utc = datetime.timezone.utc
    now = datetime.datetime.now(utc)
    max_age = datetime.timedelta(hours=hours)

    def _created_at(row):
        # The trailing "Z" marks UTC; attach tzinfo so both operands of the
        # subtraction are timezone-aware.
        parsed = datetime.datetime.strptime(
            row.createdTime, "%Y-%m-%dT%H:%M:%S.%fZ")
        return parsed.replace(tzinfo=utc)

    return [row for row in rows if now - _created_at(row) <= max_age]
|
||||
|
||||
def find_column(self, columns: list[GoozaliColumn], column_name: str) -> GoozaliColumn:
    """Return the first column whose ``name`` equals ``column_name``.

    Returns None when no column matches (including for an empty list).
    """
    return next(
        (candidate for candidate in columns if candidate.name == column_name),
        None,
    )
|
||||
# def filter_rows_by_field_column(rows: list[GoozaliRow], field_column: Column) -> list[GoozaliRow]:
|
||||
# # Current time
|
||||
# now = datetime.utcnow()
|
||||
|
||||
# # Calculate the time delta for the given hours
|
||||
# time_delta = datetime.timedelta(hours=hours)
|
||||
|
||||
# # Filter rows
|
||||
# filtered_rows = [
|
||||
# row for row in rows
|
||||
# if now - datetime.strptime(row.createdTime, "%Y-%m-%dT%H:%M:%S.%fZ") <= time_delta
|
||||
# ]
|
||||
|
||||
# return filtered_rows
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
Scrapes Goozali for jobs with scraper_input criteria
|
||||
|
@ -66,14 +114,23 @@ class GoozaliScraper(Scraper):
|
|||
cookies=full_request.cookies)
|
||||
logger.info(f"response: {str(response)}")
|
||||
if (response.status_code != 200):
|
||||
logger.error(f"Status code: {
|
||||
response.status_code}, Error: {str(response.text)}")
|
||||
logger.error(f"Status code: {response.status_code}, Error: {
|
||||
str(response.text)}")
|
||||
return JobResponse(jobs=job_list)
|
||||
except Exception as e:
|
||||
logger.error(f"Exception: {str(e)}")
|
||||
# model the response with models
|
||||
goozali_response = self.mapper.map_response_to_goozali_response(
|
||||
response=response)
|
||||
# goozali_response: GoozaliResponse = self.map_respone_to_goozali_response(
|
||||
# response)
|
||||
# create map columnId to Column object
|
||||
field = self.find_column(
|
||||
goozali_response.data.columns, "Field")
|
||||
|
||||
# filter result by Field like the web
|
||||
# filter by date
|
||||
# filtered_rows_by_age = filter_rows_by_hours(
|
||||
# goozali_response.data.table.rows, scraper_input.hours_old)
|
||||
# map to JobResponse Object
|
||||
return JobResponse(jobs=job_list)
|
||||
|
|
|
@ -1,10 +1,23 @@
|
|||
from typing import Dict, List
|
||||
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||
|
||||
|
||||
class GoozaliColumnTypeOptions:
    """Type-specific settings of a Goozali column, read from a raw dict.

    Python keeps only the last ``__init__`` defined in a class body, so the
    dict-based constructor is the single supported way to build an instance;
    the earlier field-by-field constructors were unreachable and are dropped.
    """

    def __init__(self, typeOptions: dict):
        """Populate every attribute from ``typeOptions``, defaulting missing keys.

        Args:
            typeOptions: Raw ``typeOptions`` mapping of a column. ``None`` or
                an empty mapping yields an instance holding only defaults.
        """
        # Tolerate a missing payload instead of failing on ``None.get``.
        typeOptions = typeOptions or {}

        # Select-type settings.
        self.choiceOrder = typeOptions.get("choiceOrder", [])
        # Stored exactly as received; the annotation describes the type the
        # value is expected to hold once the choices have been mapped.
        self.choices: dict[str, GoozaliColumnChoice] = typeOptions.get(
            "choices", {})
        self.disableColors = typeOptions.get("disableColors", False)

        # Date/time settings.
        self.dateFormat = typeOptions.get("dateFormat", "")
        self.isDateTime = typeOptions.get("isDateTime", False)
        self.timeZone = typeOptions.get("timeZone", "")
        self.shouldDisplayTimeZone = typeOptions.get(
            "shouldDisplayTimeZone", False)

        # Formula settings.
        self.formulaTextParsed = typeOptions.get("formulaTextParsed", "")
        self.dependencies = typeOptions.get("dependencies", [])
        self.resultType = typeOptions.get("resultType", "")
        self.resultIsArray = typeOptions.get("resultIsArray", False)
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
from typing import Dict, List
|
||||
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
|
||||
|
||||
|
||||
class GoozaliTable:
    """Plain container for a shared-view table: its metadata, columns and rows."""

    def __init__(
        self,
        applicationId: str,
        id: str,
        name: str,
        columns: List[GoozaliColumn],
        primaryColumnId: str,
        meaningfulColumnOrder: List[Dict[str, str]],
        viewOrder: List[str],
        rows: List[GoozaliRow],
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Identification.
        self.applicationId = applicationId
        self.id = id
        self.name = name
        # Column layout.
        self.columns = columns
        self.primaryColumnId = primaryColumnId
        self.meaningfulColumnOrder = meaningfulColumnOrder
        self.viewOrder = viewOrder
        # Table content.
        self.rows = rows
|
|
@ -1,6 +1,15 @@
|
|||
from jobspy.scrapers.goozali.model import GoozaliTable
|
||||
from jobspy.scrapers.goozali.model import GoozaliRow
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||
|
||||
|
||||
class GoozaliResponseData:
    """The ``data`` section of a Goozali response: table metadata, columns and rows.

    Only one constructor is kept. A second ``__init__`` in the same class
    body silently replaces the first, so the former ``__init__(self, table)``
    could never be called and is removed.
    """

    def __init__(self, applicationId: str, id: str, name: str, columns: list[GoozaliColumn], primaryColumnId: str,
                 meaningfulColumnOrder: list[dict[str, str]], viewOrder: list[str], rows: list[GoozaliRow]):
        """Store every argument unchanged on the attribute of the same name."""
        self.applicationId = applicationId
        self.id = id
        self.name = name
        self.columns = columns
        self.primaryColumnId = primaryColumnId
        self.meaningfulColumnOrder = meaningfulColumnOrder
        self.viewOrder = viewOrder
        self.rows = rows
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
from .GoozaliRow import GoozaliRow
|
||||
from .GoozaliResponse import GoozaliResponse
|
||||
from .GoozaliColumn import GoozaliColumn
|
||||
from .GoozaliPartRequest import GoozaliPartRequest
|
||||
from .FullRequest import GoozaliFullRequest
|
||||
from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
|
File diff suppressed because it is too large
Load Diff
|
@ -1,71 +1,28 @@
|
|||
import json
|
||||
import os
|
||||
from jobspy import scrape_jobs
|
||||
import pandas as pd
|
||||
|
||||
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
|
||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
||||
|
||||
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||
from jobspy.scrapers.goozali.model import GoozaliResponse
|
||||
# URL Example
|
||||
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
|
||||
|
||||
|
||||
def test_goozali():
    """Scrape five "engineer" jobs and check that a five-row DataFrame comes back."""
    # NOTE(review): the test is named for Goozali but requests
    # site_name="glassdoor" - confirm which site is intended. The assertion
    # message says "non-empty" while the check requires exactly 5 rows.
    scraped = scrape_jobs(
        site_name="glassdoor",
        search_term="engineer",
        results_wanted=5,
    )
    has_five_rows = isinstance(scraped, pd.DataFrame) and len(scraped) == 5
    assert has_five_rows, "Result should be a non-empty DataFrame"
|
||||
|
||||
|
||||
def createMockGoozaliResponse() -> GoozaliResponse:
    """Build a hand-written GoozaliResponse with one column and one row."""
    # Choices offered by the single multi-select column.
    industry_choices = {
        "selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
        "selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink"),
    }
    industry_options = GoozaliColumnTypeOptions(
        choiceOrder=["selcE6QUv4vWIIcZR",
                     "sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
        choices=industry_choices,
        disableColors=False,
    )
    industry_column = GoozaliColumn(
        id="fldIf9DbRpNRLJXuD",
        name="Industry",
        description=None,
        type="multiSelect",
        typeOptions=industry_options,
        default=None,
        initialCreatedTime="2022-12-29T10:23:21.000Z",
        initialCreatedByUserId="usr1fVy2RIyCuGHec",
        lastModifiedTime="2024-07-21T09:30:02.000Z",
        lastModifiedByUserId="usr1fVy2RIyCuGHec",
        isEditableFromSync=False,
    )

    # The single row; its cell values are keyed by column id.
    sample_row = GoozaliRow(
        id="recwiKgHT9mJrqoxa",
        createdTime="2023-01-09T10:32:09.000Z",
        cellValuesByColumnId={
            "fldLT11B0cpV6p9Uz": ["3M"],
            "fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"],
        },
    )

    mock_table = GoozaliTable(
        applicationId="app7OQjqEzTtCRq7u",
        id="tblBQjp5Aw6O172VY",
        name="Shared view table",
        columns=[industry_column],
        primaryColumnId="fldLT11B0cpV6p9Uz",
        meaningfulColumnOrder=[
            {"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
            {"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
            {"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182},
        ],
        viewOrder=["viwNRSqqmqZLP0a3C"],
        rows=[sample_row],
    )

    data = GoozaliResponseData(table=mock_table)
    return GoozaliResponse(msg="SUCCESS", data=data)
|
||||
# Module-level smoke run: load the example Goozali JSON response from disk and
# push its 'data' section through GoozaliMapper. Every handled failure is
# reported with print(); nothing is asserted.
try:
    # The path is resolved against the current working directory, so the file
    # is only found when the process is started from the directory that
    # contains 'src'.
    current_directory = os.getcwd()
    file_path = os.path.join(current_directory, 'src',
                             'tests', 'goozali_response_example.json')
    # NOTE(review): the file is read as ISO-8859-1 rather than UTF-8 - confirm
    # that this matches how the example file was saved.
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        test_json_response = json.load(file)
    print(test_json_response['msg'])  # Output: Success
    mapper = GoozaliMapper()
    # The mapped result is bound to ``response`` but not used again in the
    # code visible here.
    response = mapper._map_dict_to_goozali_response_data(
        test_json_response['data'])
    print("ya gever!!")
except FileNotFoundError:
    # The example JSON file does not exist at the computed path.
    print("The file was not found.")
except json.JSONDecodeError:
    # The file content is not valid JSON.
    print("There was an error decoding the JSON data.")
except UnicodeDecodeError as e:
    # The file bytes could not be decoded with the chosen encoding.
    print(f"Unicode decode error: {e}")
|
||||
|
|
Loading…
Reference in New Issue