mirror of https://github.com/Bunsly/JobSpy
added json file to test the mapper, from json response to classes
parent
b4454bfc65
commit
bbe3d6df35
|
@ -0,0 +1,77 @@
|
||||||
|
import json
|
||||||
|
|
||||||
|
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||||
|
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
||||||
|
|
||||||
|
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||||
|
|
||||||
|
|
||||||
|
class GoozaliMapper:
    """Turns the parsed JSON of a Goozali response into model objects."""

    def _map_dict_to_goozali_response_column_choice(self, column_choices: dict) -> dict[str, GoozaliColumnChoice]:
        """Wrap every raw choice dict in a GoozaliColumnChoice, keyed as given.

        'id' and 'name' are read as mandatory keys of each raw choice, while
        'color' falls back to an empty string when the key is absent.
        """
        return {
            choice_key: GoozaliColumnChoice(
                id=raw_choice['id'],
                name=raw_choice['name'],
                color=raw_choice.get('color', ""),
            )
            for choice_key, raw_choice in column_choices.items()
        }

    def _map_dict_to_goozali_response_column_type_option(self, type_options: dict) -> GoozaliColumnTypeOptions:
        """Build a GoozaliColumnTypeOptions and convert its raw choices, if any."""
        mapped_options = GoozaliColumnTypeOptions(typeOptions=type_options)
        raw_choices = mapped_options.choices
        if not raw_choices:
            # No choices to convert: return the options object as built.
            return mapped_options

        mapped_options.choices = self._map_dict_to_goozali_response_column_choice(
            raw_choices)
        return mapped_options

    def _map_dict_to_goozali_response_columns(self, columns: list) -> list[GoozaliColumn]:
        """Build one GoozaliColumn per raw column dict, in input order.

        Each raw dict is expanded into keyword arguments of GoozaliColumn; a
        truthy ``typeOptions`` on the result is then replaced by its mapped
        GoozaliColumnTypeOptions counterpart.
        """
        mapped_columns: list[GoozaliColumn] = []
        for raw_column in columns:
            mapped = GoozaliColumn(**raw_column)
            raw_type_options = mapped.typeOptions
            if raw_type_options:
                mapped.typeOptions = self._map_dict_to_goozali_response_column_type_option(
                    raw_type_options)
            mapped_columns.append(mapped)
        return mapped_columns

    def _map_dict_to_goozali_response_data(self, data: dict) -> GoozaliResponseData:
        """Build a GoozaliResponseData from the parsed ``data`` dictionary.

        'columns' and 'rows' are converted to model objects; the remaining
        keys are handed to GoozaliResponseData unchanged.
        """
        mapped_columns = self._map_dict_to_goozali_response_columns(
            data['columns'])
        mapped_rows = [GoozaliRow(**raw_row) for raw_row in data['rows']]

        # Keys forwarded verbatim, read in the order the constructor lists them.
        passthrough_keys = (
            'applicationId',
            'id',
            'name',
            'primaryColumnId',
            'meaningfulColumnOrder',
            'viewOrder',
        )
        passthrough = {key: data[key] for key in passthrough_keys}

        return GoozaliResponseData(
            columns=mapped_columns,
            rows=mapped_rows,
            **passthrough,
        )

    def map_response_to_goozali_response(self, response) -> GoozaliResponse:
        """Parse ``response.content`` as UTF-8 JSON and map it to a GoozaliResponse.

        The payload's 'data' entry is converted through
        ``_map_dict_to_goozali_response_data``; its 'msg' entry is passed on
        as-is.
        """
        # ``response.content`` is decoded here, so it is expected to be bytes.
        payload = json.loads(response.content.decode('utf-8'))
        response_data = self._map_dict_to_goozali_response_data(
            payload['data'])
        return GoozaliResponse(msg=payload['msg'], data=response_data)
|
|
@ -6,11 +6,13 @@ This module contains routines to scrape Goozali.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
from jobspy.scrapers import Scraper, ScraperInput
|
from jobspy.scrapers import Scraper, ScraperInput
|
||||||
from jobspy.scrapers.goozali.model.GoozaliPartRequest import GoozaliPartRequest
|
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||||
from jobspy.scrapers.goozali.model.FullRequest import GoozaliFullRequest
|
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliResponse, GoozaliPartRequest, GoozaliFullRequest
|
||||||
from jobspy.scrapers.site import Site
|
from jobspy.scrapers.site import Site
|
||||||
|
|
||||||
from ..utils import create_session, create_logger
|
from ..utils import create_session, create_logger
|
||||||
|
@ -42,9 +44,55 @@ class GoozaliScraper(Scraper):
|
||||||
delay=5,
|
delay=5,
|
||||||
clear_cookies=False,
|
clear_cookies=False,
|
||||||
)
|
)
|
||||||
|
self.mapper = GoozaliMapper()
|
||||||
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
||||||
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
||||||
|
|
||||||
|
def map_respone_to_goozali_response(self, response) -> GoozaliResponse:
    """Decode ``response.content`` as UTF-8 JSON and build a GoozaliResponse.

    The top-level keys of the parsed JSON object are passed to
    GoozaliResponse as keyword arguments, without any nested mapping.
    """
    # ``response.content`` is decoded here, so it is expected to be bytes.
    raw_text = response.content.decode('utf-8')
    parsed = json.loads(raw_text)
    return GoozaliResponse(**parsed)
|
||||||
|
|
||||||
|
# Function to filter GoozaliRows based on hours old
|
||||||
|
def filter_rows_by_hours(rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
    """Return the rows whose ``createdTime`` lies within the last ``hours`` hours.

    Args:
        rows: Rows exposing a ``createdTime`` string formatted as
            ``%Y-%m-%dT%H:%M:%S.%fZ``.
        hours: Maximum age, in hours, for a row to be kept.

    Returns:
        A new list with the rows that are at most ``hours`` old, in their
        original order.

    Raises:
        ValueError: If a ``createdTime`` does not match the expected format.
    """
    # NOTE(review): declared without ``self``; if this lives inside the
    # scraper class it cannot be called through an instance - confirm the
    # intended placement.

    # The module is imported as ``import datetime``, so the class must be
    # reached as ``datetime.datetime``; ``datetime.utcnow()`` and
    # ``datetime.strptime()`` do not exist on the module itself.
    # ``strptime`` yields naive values here, so "now" is made naive too; the
    # literal "Z" suffix is presumed to mean UTC - TODO confirm.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    max_age = datetime.timedelta(hours=hours)

    return [
        row for row in rows
        if now - datetime.datetime.strptime(row.createdTime, "%Y-%m-%dT%H:%M:%S.%fZ") <= max_age
    ]
|
||||||
|
|
||||||
|
def find_column(self, columns: list[GoozaliColumn], column_name: str) -> GoozaliColumn | None:
    """Return the first column whose ``name`` equals ``column_name``.

    Args:
        columns: Columns to search, in priority order.
        column_name: Exact name to look for (compared with ``==``).

    Returns:
        The first matching column, or ``None`` when no column matches.
    """
    for column in columns:
        if column.name == column_name:
            return column
    # No match: say so explicitly instead of falling off the end of the
    # function, so the return annotation reflects what callers can get back.
    return None
|
||||||
|
# def filter_rows_by_field_column(rows: list[GoozaliRow], field_column: Column) -> list[GoozaliRow]:
|
||||||
|
# # Current time
|
||||||
|
# now = datetime.utcnow()
|
||||||
|
|
||||||
|
# # Calculate the time delta for the given hours
|
||||||
|
# time_delta = datetime.timedelta(hours=hours)
|
||||||
|
|
||||||
|
# # Filter rows
|
||||||
|
# filtered_rows = [
|
||||||
|
# row for row in rows
|
||||||
|
# if now - datetime.strptime(row.createdTime, "%Y-%m-%dT%H:%M:%S.%fZ") <= time_delta
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# return filtered_rows
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
"""
|
"""
|
||||||
Scrapes Goozali for jobs with scraper_input criteria
|
Scrapes Goozali for jobs with scraper_input criteria
|
||||||
|
@ -66,14 +114,23 @@ class GoozaliScraper(Scraper):
|
||||||
cookies=full_request.cookies)
|
cookies=full_request.cookies)
|
||||||
logger.info(f"response: {str(response)}")
|
logger.info(f"response: {str(response)}")
|
||||||
if (response.status_code != 200):
|
if (response.status_code != 200):
|
||||||
logger.error(f"Status code: {
|
logger.error(f"Status code: {response.status_code}, Error: {
|
||||||
response.status_code}, Error: {str(response.text)}")
|
str(response.text)}")
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Exception: {str(e)}")
|
logger.error(f"Exception: {str(e)}")
|
||||||
# model the response with models
|
# model the response with models
|
||||||
|
goozali_response = self.mapper.map_response_to_goozali_response(
|
||||||
|
response=response)
|
||||||
|
# goozali_response: GoozaliResponse = self.map_respone_to_goozali_response(
|
||||||
|
# response)
|
||||||
# create map columnId to Column object
|
# create map columnId to Column object
|
||||||
|
field = self.find_column(
|
||||||
|
goozali_response.data.columns, "Field")
|
||||||
|
|
||||||
# filter result by Field like the web
|
# filter result by Field like the web
|
||||||
# filter by date
|
# filter by date
|
||||||
|
# filtered_rows_by_age = filter_rows_by_hours(
|
||||||
|
# goozali_response.data.table.rows, scraper_input.hours_old)
|
||||||
# map to JobResponse Object
|
# map to JobResponse Object
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
|
@ -1,10 +1,23 @@
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||||
|
|
||||||
|
|
||||||
class GoozaliColumnTypeOptions:
    """Type options of a Goozali column, read from a raw ``typeOptions`` dict.

    Every attribute is taken from the dict by its own name; a missing key
    yields an empty list, dict or string, or ``False``, depending on the
    attribute.
    """

    def __init__(self, typeOptions: dict):
        """Populate the attributes from ``typeOptions``.

        Args:
            typeOptions: Raw type-options mapping. ``None`` or an empty
                mapping produces an object holding only default values.
        """
        # The class used to define ``__init__`` twice; Python keeps only the
        # last definition in a class body, so the field-by-field variant was
        # never reachable. Only the effective constructor is kept.
        options = typeOptions or {}

        self.choiceOrder = options.get("choiceOrder", [])
        # Holds whatever the raw dict carries under "choices"; the annotation
        # names the type callers are expected to store here after mapping.
        self.choices: dict[str, GoozaliColumnChoice] = options.get(
            "choices", {})
        self.disableColors = options.get("disableColors", False)
        self.dateFormat = options.get("dateFormat", "")
        self.isDateTime = options.get("isDateTime", False)
        self.timeZone = options.get("timeZone", "")
        self.shouldDisplayTimeZone = options.get(
            "shouldDisplayTimeZone", False)
        self.formulaTextParsed = options.get("formulaTextParsed", "")
        self.dependencies = options.get("dependencies", [])
        self.resultType = options.get("resultType", "")
        self.resultIsArray = options.get("resultIsArray", False)
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
|
|
||||||
|
|
||||||
|
|
||||||
class GoozaliTable:
|
|
||||||
def __init__(self, applicationId: str, id: str, name: str, columns: List[GoozaliColumn], primaryColumnId: str,
|
|
||||||
meaningfulColumnOrder: List[Dict[str, str]], viewOrder: List[str], rows: List[GoozaliRow]):
|
|
||||||
self.applicationId = applicationId
|
|
||||||
self.id = id
|
|
||||||
self.name = name
|
|
||||||
self.columns = columns
|
|
||||||
self.primaryColumnId = primaryColumnId
|
|
||||||
self.meaningfulColumnOrder = meaningfulColumnOrder
|
|
||||||
self.viewOrder = viewOrder
|
|
||||||
self.rows = rows
|
|
|
@ -1,6 +1,15 @@
|
||||||
from jobspy.scrapers.goozali.model import GoozaliTable
|
from jobspy.scrapers.goozali.model import GoozaliRow
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||||
|
|
||||||
|
|
||||||
class GoozaliResponseData:
    """Value holder for the fields of a Goozali response's data section.

    Each constructor argument is stored unchanged on an attribute of the
    same name.
    """

    def __init__(
        self,
        applicationId: str,
        id: str,
        name: str,
        columns: list[GoozaliColumn],
        primaryColumnId: str,
        meaningfulColumnOrder: list[dict[str, str]],
        viewOrder: list[str],
        rows: list[GoozaliRow],
    ):
        # Identification fields.
        self.applicationId = applicationId
        self.id = id
        self.name = name

        # Column definitions and ordering information.
        self.columns = columns
        self.primaryColumnId = primaryColumnId
        self.meaningfulColumnOrder = meaningfulColumnOrder
        self.viewOrder = viewOrder

        # Row records.
        self.rows = rows
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
from .GoozaliRow import GoozaliRow
|
||||||
|
from .GoozaliResponse import GoozaliResponse
|
||||||
|
from .GoozaliColumn import GoozaliColumn
|
||||||
|
from .GoozaliPartRequest import GoozaliPartRequest
|
||||||
|
from .FullRequest import GoozaliFullRequest
|
||||||
|
from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
|
File diff suppressed because it is too large
Load Diff
|
@ -1,71 +1,28 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
from jobspy import scrape_jobs
|
from jobspy import scrape_jobs
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
|
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
from jobspy.scrapers.goozali.model import GoozaliResponse
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
|
|
||||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
|
||||||
|
|
||||||
# URL Example
|
# URL Example
|
||||||
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
|
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
|
||||||
|
|
||||||
|
|
||||||
def test_goozali():
|
try:
|
||||||
result = scrape_jobs(
|
current_directory = os.getcwd()
|
||||||
site_name="glassdoor",
|
file_path = os.path.join(current_directory, 'src',
|
||||||
search_term="engineer",
|
'tests', 'goozali_response_example.json')
|
||||||
results_wanted=5,
|
with open(file_path, 'r', encoding='ISO-8859-1') as file:
|
||||||
)
|
test_json_response = json.load(file)
|
||||||
assert (
|
print(test_json_response['msg']) # Output: Success
|
||||||
isinstance(result, pd.DataFrame) and len(result) == 5
|
mapper = GoozaliMapper()
|
||||||
), "Result should be a non-empty DataFrame"
|
response = mapper._map_dict_to_goozali_response_data(
|
||||||
|
test_json_response['data'])
|
||||||
|
print("ya gever!!")
|
||||||
def createMockGoozaliResponse() -> GoozaliResponse:
|
except FileNotFoundError:
|
||||||
data = GoozaliResponseData(table=GoozaliTable(
|
print("The file was not found.")
|
||||||
applicationId="app7OQjqEzTtCRq7u",
|
except json.JSONDecodeError:
|
||||||
id="tblBQjp5Aw6O172VY",
|
print("There was an error decoding the JSON data.")
|
||||||
name="Shared view table",
|
except UnicodeDecodeError as e:
|
||||||
columns=[
|
print(f"Unicode decode error: {e}")
|
||||||
GoozaliColumn(
|
|
||||||
id="fldIf9DbRpNRLJXuD",
|
|
||||||
name="Industry",
|
|
||||||
description=None,
|
|
||||||
type="multiSelect",
|
|
||||||
typeOptions=GoozaliColumnTypeOptions(
|
|
||||||
choiceOrder=["selcE6QUv4vWIIcZR",
|
|
||||||
"sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
|
|
||||||
choices={
|
|
||||||
"selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
|
|
||||||
"selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink")
|
|
||||||
},
|
|
||||||
disableColors=False
|
|
||||||
),
|
|
||||||
default=None,
|
|
||||||
initialCreatedTime="2022-12-29T10:23:21.000Z",
|
|
||||||
initialCreatedByUserId="usr1fVy2RIyCuGHec",
|
|
||||||
lastModifiedTime="2024-07-21T09:30:02.000Z",
|
|
||||||
lastModifiedByUserId="usr1fVy2RIyCuGHec",
|
|
||||||
isEditableFromSync=False
|
|
||||||
)
|
|
||||||
],
|
|
||||||
primaryColumnId="fldLT11B0cpV6p9Uz",
|
|
||||||
meaningfulColumnOrder=[
|
|
||||||
{"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
|
|
||||||
{"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
|
|
||||||
{"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182}
|
|
||||||
],
|
|
||||||
viewOrder=["viwNRSqqmqZLP0a3C"],
|
|
||||||
rows=[
|
|
||||||
GoozaliRow(
|
|
||||||
id="recwiKgHT9mJrqoxa",
|
|
||||||
createdTime="2023-01-09T10:32:09.000Z",
|
|
||||||
cellValuesByColumnId={
|
|
||||||
"fldLT11B0cpV6p9Uz": ["3M"],
|
|
||||||
"fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
]
|
|
||||||
))
|
|
||||||
return GoozaliResponse(msg="SUCCESS", data=data)
|
|
||||||
|
|
Loading…
Reference in New Issue