mirror of https://github.com/Bunsly/JobSpy
created new component to be able to test it properly
parent
7926356233
commit
7971d0fab7
|
@ -0,0 +1,49 @@
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
from jobspy.scrapers.goozali.model import GoozaliRow
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||||
|
from jobspy.scrapers.utils import create_logger
|
||||||
|
|
||||||
|
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||||
|
|
||||||
|
logger = create_logger("GoozaliScrapperComponent")
|
||||||
|
|
||||||
|
|
||||||
|
class GoozaliScrapperComponent:
    """Helper methods for working with Goozali rows, columns and choices.

    The methods read no instance state; they live on a class so the scraper
    can hold a component instance and the logic can be tested on its own.
    """

    def __init__(self):
        """Create the component; there is no state to initialise."""
        pass

    def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
        """Return the rows created within the last ``hours`` hours.

        Args:
            rows: Rows to filter. Each row must expose ``createdTime``.
            hours: Maximum row age in hours. ``None`` disables the filter
                and every row is returned.

        Returns:
            A new list with the rows whose age is at most ``hours`` hours,
            in their original order.
        """
        # Local import: the module imports only the ``datetime`` class, and
        # ``timedelta`` is not an attribute of that class (the previous
        # ``datetime.timedelta(...)`` call raised AttributeError).
        from datetime import timedelta

        if hours is None:
            # No age limit requested, so nothing to filter out.
            return list(rows)

        now = datetime.now()
        max_age = timedelta(hours=hours)

        # NOTE(review): assumes ``row.createdTime`` is a naive ``datetime``
        # that can be subtracted from ``datetime.now()`` — confirm against
        # the GoozaliRow model.
        return [row for row in rows if now - row.createdTime <= max_age]

    def find_column(self, columns: list[GoozaliColumn], column_name: str) -> GoozaliColumn:
        """Return the first column whose ``name`` equals ``column_name``.

        Args:
            columns: Columns to search, in order.
            column_name: Exact name to match.

        Returns:
            The matching column, or ``None`` when no column has that name.
        """
        for column in columns:
            if column.name == column_name:
                return column
        # Made explicit: callers get None when nothing matches.
        return None

    def find_choice_from_column(self, column: GoozaliColumn, choice_name: str) -> GoozaliColumnChoice:
        """Return the choice of ``column`` whose ``name`` equals ``choice_name``.

        Args:
            column: Column whose ``typeOptions.choices`` mapping is searched.
            choice_name: Exact choice name to match.

        Returns:
            The first matching choice.

        Raises:
            Exception: If the column has no choices, or none of them is
                named ``choice_name``.
        """
        choices = column.typeOptions.choices
        if not choices:
            message = f"Choices for column {column.name} doesn't exist"
            # logger.error, not logger.exception: there is no active
            # exception here, so there is no traceback to attach.
            logger.error(message)
            raise Exception(message)

        # Only the values are needed; the mapping keys are not used.
        for choice in choices.values():
            if choice.name == choice_name:
                return choice

        message = f"Can't find {choice_name} for column {column.name}"
        logger.error(message)
        raise Exception(message)
|
|
@ -12,7 +12,9 @@ import json
|
||||||
|
|
||||||
from jobspy.scrapers import Scraper, ScraperInput
|
from jobspy.scrapers import Scraper, ScraperInput
|
||||||
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||||
|
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
|
||||||
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliResponse, GoozaliPartRequest, GoozaliFullRequest
|
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliResponse, GoozaliPartRequest, GoozaliFullRequest
|
||||||
|
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
||||||
from jobspy.scrapers.site import Site
|
from jobspy.scrapers.site import Site
|
||||||
|
|
||||||
from ..utils import create_session, create_logger
|
from ..utils import create_session, create_logger
|
||||||
|
@ -47,47 +49,7 @@ class GoozaliScraper(Scraper):
|
||||||
self.mapper = GoozaliMapper()
|
self.mapper = GoozaliMapper()
|
||||||
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
||||||
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
||||||
|
self.component = GoozaliScrapperComponent()
|
||||||
def map_respone_to_goozali_response(self, response) -> GoozaliResponse:
|
|
||||||
# Check the response content (this is a bytes object)
|
|
||||||
response_content = response.content
|
|
||||||
# Decode the byte content to a string
|
|
||||||
decoded_content = response_content.decode('utf-8')
|
|
||||||
# Now you can parse the decoded content as JSON
|
|
||||||
data = json.loads(decoded_content)
|
|
||||||
|
|
||||||
return GoozaliResponse(**data)
|
|
||||||
|
|
||||||
# Function to filter GoozaliRows based on hours old
|
|
||||||
def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
|
|
||||||
# Current time
|
|
||||||
now = datetime.datetime.now()
|
|
||||||
|
|
||||||
# Calculate the time delta for the given hours
|
|
||||||
time_delta = datetime.timedelta(hours=hours)
|
|
||||||
|
|
||||||
# Filter rows
|
|
||||||
filtered_rows = [
|
|
||||||
row for row in rows
|
|
||||||
if now - row.createdTime <= time_delta
|
|
||||||
]
|
|
||||||
|
|
||||||
return filtered_rows
|
|
||||||
|
|
||||||
def find_column(self, columns: list[GoozaliColumn], column_name: str) -> GoozaliColumn:
|
|
||||||
for column in columns:
|
|
||||||
if (column.name == column_name):
|
|
||||||
return column
|
|
||||||
|
|
||||||
# def filter_rows_by_column(rows: list[GoozaliRow], goozali_column: GoozaliColumn) -> list[GoozaliRow]:
|
|
||||||
|
|
||||||
# # Filter rows
|
|
||||||
# filtered_rows = [
|
|
||||||
# row for row in rows
|
|
||||||
# if row.cellValuesByColumnId[goozali_column.id] == goozali_column.
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# return filtered_rows
|
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
"""
|
"""
|
||||||
|
@ -118,12 +80,15 @@ class GoozaliScraper(Scraper):
|
||||||
# model the response with models
|
# model the response with models
|
||||||
goozali_response = self.mapper.map_response_to_goozali_response(
|
goozali_response = self.mapper.map_response_to_goozali_response(
|
||||||
response=response)
|
response=response)
|
||||||
# filter by date
|
# suggest: create groupby field and then filter by hours
|
||||||
filtered_rows_by_age = self.filter_rows_by_hours(
|
|
||||||
goozali_response.data.rows, scraper_input.hours_old)
|
|
||||||
# filter result by Field like the web
|
# filter result by Field like the web
|
||||||
field_cloumn = self.find_column(
|
field_cloumn = self.component.find_column(
|
||||||
goozali_response.data.columns, "Field")
|
goozali_response.data.columns, "Field")
|
||||||
|
software_engineering_choice = self.component.find_choice_from_column(
|
||||||
|
field_cloumn, "Software Engineering")
|
||||||
|
# filter by date
|
||||||
|
filtered_rows_by_age = self.component.filter_rows_by_hours(
|
||||||
|
goozali_response.data.rows, scraper_input.hours_old)
|
||||||
# map to JobResponse Object
|
# map to JobResponse Object
|
||||||
|
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
Loading…
Reference in New Issue