Commit
now with using nbdev_prepare
miTTimmiTTim committed Nov 26, 2024
1 parent 64e2176 commit ab22103
Showing 6 changed files with 1,658 additions and 308 deletions.
132 changes: 105 additions & 27 deletions ddopai/_modidx.py

Large diffs are not rendered by default.

336 changes: 336 additions & 0 deletions ddopai/dataloaders/online.py
@@ -0,0 +1,336 @@
"""Dataloaders for Online env"""

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/10_dataloaders/13_online_dataloaders.ipynb.

# %% auto 0
__all__ = ['OnlineDataLoader', 'normalize_features', 'prep_lag_features', 'update_lag_features', 'X_shape', 'Y_shape', 'len_val',
'len_test', 'get_all_X', 'get_all_Y']

# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 3
import logging
logging.basicConfig(level=logging.INFO)

import numpy as np
from abc import ABC, abstractmethod
from typing import Union, Tuple, List, Literal
import pandas as pd
import math
from scipy.stats import norm
from .base import BaseDataLoader

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 4
class OnlineDataLoader(BaseDataLoader):

"""
A class for online data depending on the action taken.
X is an numpy array and Y is a function dependent on alpha, beta, function_form and epsilon.
X may be of shape (datapoints, features) TODO: Lag features
Y is of shape (datapoints, units) and takes as input (X, action).
"""

def __init__(self,
X: np.ndarray,
alpha: float | np.ndarray,
beta: float | np.ndarray,
epsilon: np.ndarray,
function_form: str | np.ndarray = 'linear',
val_index_start: int = None,
test_index_start: int = None,
normalize_features: dict = None,
):
self.X = X
self.alpha = alpha
self.beta = beta
self.epsilon = epsilon

self.function_form = function_form

self.val_index_start = val_index_start
self.test_index_start = test_index_start

# train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
if self.val_index_start is not None:
self.train_index_end = self.val_index_start-1
elif self.test_index_start is not None:
self.train_index_end = self.test_index_start-1
else:
            self.train_index_end = len(X)-1

self.dataset_type = "train"

        # X must at least have datapoint and feature dimensions before the scaler is fitted
        if len(X.shape) == 1:
            self.X = X.reshape(-1, 1)

        if len(epsilon.shape) == 1:
            self.epsilon = epsilon.reshape(-1, 1)

        if isinstance(alpha, np.ndarray) and len(alpha.shape) == 1:
            self.alpha = alpha.reshape(-1, 1)

        if isinstance(beta, np.ndarray) and len(beta.shape) == 1:
            self.beta = beta.reshape(-1, 1)

        if isinstance(function_form, np.ndarray) and len(function_form.shape) == 1:
            self.function_form = function_form.reshape(-1, 1)

        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}

        self.normalize_features(**normalize_features, initial_normalization=True)

        if isinstance(self.alpha, np.ndarray) and isinstance(self.beta, np.ndarray) and isinstance(self.function_form, np.ndarray):
            assert self.alpha.shape[0] == self.beta.shape[0] == self.function_form.shape[0] == self.X.shape[0] == self.epsilon.shape[0], "alpha, beta, X, epsilon and function_form must have the same length"
        else:
            assert self.X.shape[0] == self.epsilon.shape[0], "X and epsilon must have the same length"

self.num_units = self.epsilon.shape[1]

super().__init__()

def normalize_features(self,
normalize: bool = True,
ignore_one_hot: bool = True,
initial_normalization: bool = False
):
"""
Normalize features using a standard scalar. If ignore_one_hot is True, one-hot encoded columns are not normalized.
"""

if normalize:

scalar = StandardScaler()

if initial_normalization:

if len(self.X.shape) == 3:
raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')

scalar.fit(self.X[:self.train_index_end+1])
scalar.transform(self.X)

if initial_normalization:
return
else:
raise NotImplementedError('Normalization after lag features have been set not implemented yet')

# Idea:
# remove time dimension
# normalize features
# add time_dimension back
# Problem:
# usage of prep_lag_features needs to ensure y is not added a second time
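
    # A minimal sketch (an assumption, not the library's implementation) of the
    # idea outlined above: flatten the time dimension so an already-fitted scaler
    # can be applied to 3D lagged features, then restore the window axis.
    @staticmethod
    def _normalize_lagged_features_sketch(X: np.ndarray, scaler: StandardScaler) -> np.ndarray:
        # X: (datapoints, time, features). Returns the same shape, normalized per feature.
        n, t, f = X.shape
        flat = X.reshape(n * t, f)      # remove time dimension
        flat = scaler.transform(flat)   # normalize features
        return flat.reshape(n, t, f)    # add time dimension back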

def prep_lag_features(self,
lag_window: int=0,
include_y: bool=False,
pre_calc: bool=False
):
"""
Create lag feature for the dataset. If "inlcude_y" is true, then a lag-1 of of the target variable is added as a feature.
If lag-window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
including lag-2 demand. If pre-calc is true, all these calculations are performed on the entire dataset reduce
computation time later on at the expense of increases memory usage.
TODO: address the fact that this needs to be done direclty before calling getitem especially when include_y is true
"""
raise NotImplementedError('Not implemented yet')
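
    # A minimal sketch (an assumption, not the library's implementation) of the
    # lag windowing described in the docstring above: a plain sliding window over time.
    @staticmethod
    def _lag_window_sketch(X: np.ndarray, lag_window: int) -> np.ndarray:
        # X: (datapoints, features). Returns (datapoints - lag_window, lag_window + 1, features),
        # where index 0 on the middle axis is the oldest step and index -1 the current one,
        # so lag_window=1 yields the 2 time steps described above.
        windows = [X[i: len(X) - lag_window + i] for i in range(lag_window + 1)]
        return np.stack(windows, axis=1)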

def update_lag_features(self,
lag_window: int,
):

""" Update lag window parameters for dataloader object that is already initialized """

raise NotImplementedError('Not implemented yet')

# Problem: updating lag_features naively would shorten the dataset each time it is called
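    # Illustration (made-up numbers): re-windowing an already-windowed array of
    # length N - lag_window with a new window w' leaves N - lag_window - w' rows,
    # so repeated calls keep shrinking the data unless the raw X is kept around.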

def __getitem__(self, index: int):

"""
get item by index, depending on the dataset type (train, val, test)
"""

if self.dataset_type == "train":
if index > self.train_index_end:
raise IndexError('Index out of bounds')

elif self.dataset_type == "val":
idx = idx + self.val_index_start

if idx >= self.test_index_start:
raise IndexError(f'index{idx} out of range{self.test_index_start}')

elif self.dataset_type == "test":
idx = idx + self.test_index_start

if idx >= len(self.X):
raise IndexError(f'index{idx} out of range{len(self.X)}')

elif self.dataset_type == "online":
if idx >= len(self.X):
raise IndexError(f'index{idx} out of range{len(self.X)}')

else:
raise ValueError('dataset_type not set')

return self.X[index], self._get_Y(index)

def _get_Y(self, index: int):

"""
Get Y function depending on the current index.
"""

if isinstance(self.alpha, np.ndarray):
alpha = self.alpha[index]
else:
alpha = self.alpha

if isinstance(self.beta, np.ndarray):
beta = self.beta[index]
else:
beta = self.beta

if isinstance(self.function_form, np.ndarray):
function_form = self.function_form[index]
else:
function_form = self.function_form

epsilon = self.epsilon[index]

if function_form == 'linear':
def linear(X, action):
demand = np.dot(alpha, X) + np.dot(beta, X) * action + epsilon
return np.maximum(demand, 0)

return linear
if function_form == 'log':
def log(X, action):
demand = np.divide(np.exp(np.dot(alpha, X) + np.dot(beta, X) * action), 1 + np.exp(np.dot(alpha, X) + np.dot(beta, X) * action)) + epsilon
return np.maximum(demand, 0)
return log
if function_form == 'exp':
def exp(X, action):
demand = np.exp(np.dot(alpha, X) + np.dot(beta, X) * action) + epsilon
return np.maximum(demand, 0)
return exp
        if function_form == 'probit': # TODO: think about this more
            raise NotImplementedError('Probit not implemented yet')
            # Draft, unreachable until implemented:
            # def probit(X, action):
            #     demand = norm.cdf(-np.dot(beta, X) * action) * np.dot(alpha, X) + epsilon
            #     return np.maximum(demand, 0)
            # return probit

        raise ValueError(f'function_form {function_form} not recognized')
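
    # Worked illustration (made-up numbers): for the 'linear' form above with
    # alpha = [2, 0], beta = [-1, 0], X = [1, 1], epsilon = 0 and action = 1,
    # demand = alpha.X + (beta.X) * action + epsilon = 2 + (-1)*1 + 0 = 1,
    # and np.maximum(1, 0) = 1.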

def __len__(self):
return len(self.X)

@property
def X_shape(self):
return self.X.shape

@property
def Y_shape(self):
        return self.epsilon.shape

    @property
    def len_val(self):
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        # if no test set is defined, the validation set runs to the end of the data
        end = self.test_index_start if self.test_index_start is not None else len(self.X)
        return end - self.val_index_start

@property
def len_test(self):
if self.test_index_start is None:
raise ValueError('no test set defined')
        return len(self.X)-self.test_index_start

def get_all_X(self,
dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
):

"""
Returns the entire features dataset.
Return either the train, val, test, or all data.
"""

if dataset_type == 'train':
return self.X[:self.val_index_start].copy() if self.X is not None else None
elif dataset_type == 'val':
return self.X[self.val_index_start:self.test_index_start].copy() if self.X is not None else None
elif dataset_type == 'test':
return self.X[self.test_index_start:].copy() if self.X is not None else None
elif dataset_type == 'all' or dataset_type == 'online':
return self.X.copy() if self.X is not None else None
else:
raise ValueError('dataset_type not recognized')

def get_all_Y(self,
                    dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
):

"""
Returns the entire target dataset.
Return either the train, val, test, or all data.
"""
if isinstance(self.alpha, np.ndarray):
if dataset_type == 'train':
alpha = self.alpha[:self.val_index_start].copy()
elif dataset_type == 'val':
alpha = self.alpha[self.val_index_start:self.test_index_start].copy()
elif dataset_type == 'test':
alpha = self.alpha[self.test_index_start:].copy()
elif dataset_type == 'all' or dataset_type == 'online':
alpha = self.alpha.copy()
else:
raise ValueError('dataset_type not recognized')

else:
alpha = self.alpha

if isinstance(self.beta, np.ndarray):
if dataset_type == 'train':
beta = self.beta[:self.val_index_start].copy()
elif dataset_type == 'val':
beta = self.beta[self.val_index_start:self.test_index_start].copy()
elif dataset_type == 'test':
beta = self.beta[self.test_index_start:].copy()
elif dataset_type == 'all' or dataset_type == 'online':
beta = self.beta.copy()
else:
raise ValueError('dataset_type not recognized')

        else:
            beta = self.beta

if isinstance(self.function_form, np.ndarray):
if dataset_type == 'train':
function_form = self.function_form[:self.val_index_start].copy()
elif dataset_type == 'val':
function_form = self.function_form[self.val_index_start:self.test_index_start].copy()
elif dataset_type == 'test':
function_form = self.function_form[self.test_index_start:].copy()
elif dataset_type == 'all' or dataset_type == 'online':
function_form = self.function_form.copy()
else:
raise ValueError('dataset_type not recognized')

else:
function_form = self.function_form

if dataset_type == 'train':
epsilon = self.epsilon[:self.val_index_start].copy()
elif dataset_type == 'val':
epsilon = self.epsilon[self.val_index_start:self.test_index_start].copy()
elif dataset_type == 'test':
epsilon = self.epsilon[self.test_index_start:].copy()
elif dataset_type == 'all' or dataset_type == 'online':
epsilon = self.epsilon.copy()
else:
raise ValueError('dataset_type not recognized')

return alpha, beta, function_form, epsilon
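

# Minimal usage sketch (not part of the library; all numbers are made up for
# illustration, and it assumes BaseDataLoader needs no further arguments):
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 1))                    # (datapoints, features)
    epsilon = rng.normal(scale=0.1, size=(100, 1))   # (datapoints, units)

    loader = OnlineDataLoader(
        X=X, alpha=2.0, beta=-0.5, epsilon=epsilon,
        function_form='linear', val_index_start=70, test_index_start=85,
    )

    features, demand_fn = loader[0]   # Y is a function of (X, action)
    print(demand_fn(features, 1.0))   # realized demand for action 1.0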

