now with using nbdev_prepare

d3group · Nov 26, 2024 · ab22103 · ab22103
1 parent 64e2176
commit ab22103
Show file tree

Hide file tree

Showing 6 changed files with 1,658 additions and 308 deletions.
diff --git a/ddopai/_modidx.py b/ddopai/_modidx.py
diff --git a/ddopai/dataloaders/online.py b/ddopai/dataloaders/online.py
@@ -0,0 +1,336 @@
+"""Dataloaders for Online env"""
+
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/10_dataloaders/13_online_dataloaders.ipynb.
+
+# %% auto 0
+__all__ = ['OnlineDataLoader', 'normalize_features', 'prep_lag_features', 'update_lag_features', 'X_shape', 'Y_shape', 'len_val',
+           'len_test', 'get_all_X', 'get_all_Y']
+
+# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 3
+import logging
+logging.basicConfig(level=logging.INFO)
+
+import numpy as np
+from abc import ABC, abstractmethod
+from typing import Union, Tuple, List, Literal
+import pandas as pd
+import math
+from scipy.stats import norm
+from .base import BaseDataLoader
+
+from sklearn.preprocessing import StandardScaler, MinMaxScaler 
+
+# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 4
+class OnlineDataLoader(BaseDataLoader):
+
+    """
+    A dataloader for online data in which the target depends on the action taken.
+    X is a NumPy array and Y is a function dependent on alpha, beta, function_form and epsilon.
+    X may be of shape (datapoints, features). TODO: lag features
+    Y is of shape (datapoints, units) and takes as input (X, action).
+    """
+
+    # NOTE(review): the functions that follow this class (``__init__``, ``normalize_features``,
+    # ``__getitem__``, ...) start at column 0, i.e. at module level, and ``__all__`` lists them as
+    # module-level names. Presumably they are meant to be methods of this class -- confirm the
+    # indentation in the source notebook.
+
+def __init__(self,
+    X: np.ndarray,
+    alpha: float | np.ndarray,
+    beta: float | np.ndarray,
+    epsilon: np.ndarray,
+    function_form: str | np.ndarray = 'linear',
+    val_index_start: int = None,
+    test_index_start: int = None,
+    normalize_features: dict = None,
+):
+    """
+    Store the features and the demand-function parameters and set up the train/val/test split.
+
+    Args:
+        X: feature array. A 1-D array is reshaped to a single feature column.
+        alpha: scalar, or per-datapoint array (a 1-D array is reshaped to a column).
+        beta: scalar, or per-datapoint array (a 1-D array is reshaped to a column).
+        epsilon: noise array with one row per datapoint (a 1-D array is reshaped to a column).
+            Its second dimension defines ``num_units``.
+        function_form: functional form of the demand function, either one string for all
+            datapoints or a per-datapoint array (a 1-D array is reshaped to a column).
+        val_index_start: first index of the validation set, or None if there is none.
+        test_index_start: first index of the test set, or None if there is none.
+        normalize_features: keyword arguments forwarded to ``self.normalize_features``.
+            Defaults to ``{'normalize': True, 'ignore_one_hot': True}``.
+
+    Raises:
+        AssertionError: if the per-datapoint inputs do not all have the same length.
+    """
+    self.X = X
+    self.alpha = alpha
+    self.beta = beta
+    self.epsilon = epsilon
+
+    self.function_form = function_form
+
+    self.val_index_start = val_index_start
+    self.test_index_start = test_index_start
+
+    # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
+    if self.val_index_start is not None:
+        self.train_index_end = self.val_index_start - 1
+    elif self.test_index_start is not None:
+        self.train_index_end = self.test_index_start - 1
+    else:
+        # there is no stored Y in this loader; the dataset length is the number of rows of X
+        self.train_index_end = len(X) - 1
+
+    self.dataset_type = "train"
+
+    # X must at least have datapoint and feature dimension. This has to happen before
+    # normalization, because the scaler is fitted on 2-D data.
+    if len(X.shape) == 1:
+        self.X = X.reshape(-1, 1)
+
+    if len(epsilon.shape) == 1:
+        self.epsilon = epsilon.reshape(-1, 1)
+
+    if isinstance(alpha, np.ndarray) and len(alpha.shape) == 1:
+        self.alpha = alpha.reshape(-1, 1)
+
+    if isinstance(beta, np.ndarray) and len(beta.shape) == 1:
+        self.beta = beta.reshape(-1, 1)
+
+    if isinstance(function_form, np.ndarray) and len(function_form.shape) == 1:
+        self.function_form = function_form.reshape(-1, 1)
+
+    normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
+
+    self.normalize_features(**normalize_features, initial_normalization=True)
+
+    if isinstance(self.alpha, np.ndarray) and isinstance(self.beta, np.ndarray) and isinstance(self.function_form, np.ndarray):
+        # compare row counts only (epsilon.shape[0], not the first row of epsilon)
+        assert self.alpha.shape[0] == self.beta.shape[0] == self.function_form.shape[0] == self.X.shape[0] == self.epsilon.shape[0], "alpha, beta, X, epsilon and function_form must have the same length"
+    else:
+        assert self.X.shape[0] == self.epsilon.shape[0], "X and epsilon must have the same length"
+
+    self.num_units = self.epsilon.shape[1]
+
+    super().__init__()
+
+def normalize_features(self,
+                       normalize: bool = True,
+                       ignore_one_hot: bool = True,
+                       initial_normalization: bool = False
+                       ):
+    """
+    Normalize the features in ``self.X`` using a standard scaler fitted on the training rows.
+
+    Args:
+        normalize: if False, nothing is done.
+        ignore_one_hot: if True, columns that only contain the values 0 and 1 are treated as
+            one-hot encoded and are left untouched.
+        initial_normalization: must be True; it marks the normalization that is done before any
+            lag features have been added.
+
+    Raises:
+        ValueError: if ``self.X`` already has a lag (time) dimension.
+        NotImplementedError: if ``initial_normalization`` is False.
+    """
+
+    if not normalize:
+        return
+
+    if not initial_normalization:
+        # Idea:
+            # remove time dimension
+            # normalize features
+            # add time_dimension back
+        # Problem:
+            # usage of prep_lag_features needs to ensure y is not added a second time
+        raise NotImplementedError('Normalization after lag features have been set not implemented yet')
+
+    if len(self.X.shape) == 3:
+        raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')
+
+    if ignore_one_hot:
+        # a column counts as one-hot encoded if every entry is either 0 or 1
+        columns = ~np.all((self.X == 0) | (self.X == 1), axis=0)
+    else:
+        columns = np.ones(self.X.shape[1], dtype=bool)
+
+    if not np.any(columns):
+        return
+
+    scaler = StandardScaler()
+    # fit on the training rows only so that no validation/test information leaks into the scaling
+    scaler.fit(self.X[:self.train_index_end + 1, columns])
+
+    # transform() returns a new array, so the result has to be stored explicitly;
+    # astype() copies, which keeps the array passed in by the caller unchanged
+    normalized = self.X.astype(float)
+    normalized[:, columns] = scaler.transform(self.X[:, columns])
+    self.X = normalized
+
+def prep_lag_features(self, lag_window: int = 0, include_y: bool = False, pre_calc: bool = False):
+    """
+    Create lag features for the dataset (not implemented yet).
+
+    Intended behavior: if ``include_y`` is True, a lag-1 of the target variable is added as a
+    feature. If ``lag_window`` is > 0, the lag features are added as a middle dimension to X.
+    For example, with a lag window of 1 the data will include 2 time steps: the current features
+    including lag-1 demand, and the lag-1 features including lag-2 demand. If ``pre_calc`` is
+    True, all these calculations are performed on the entire dataset up front to reduce
+    computation time later on, at the expense of increased memory usage.
+
+    TODO: address the fact that this needs to be done directly before calling getitem,
+    especially when include_y is True.
+
+    Raises:
+        NotImplementedError: always.
+    """
+    message = 'Not implemented yet'
+    raise NotImplementedError(message)
+
+def update_lag_features(self, lag_window: int):
+    """
+    Update the lag window of a dataloader object that is already initialized (not implemented yet).
+
+    Open problem: updating the lag features naively would shorten the dataset each time this
+    is called.
+
+    Raises:
+        NotImplementedError: always.
+    """
+    message = 'Not implemented yet'
+    raise NotImplementedError(message)
+
+def __getitem__(self, index: int):
+
+    """
+    Get an item by index, depending on the dataset type (train, val, test, online).
+
+    ``index`` is relative to the start of the currently selected dataset; for "val" and "test"
+    it is shifted by the respective start index before the data is accessed.
+
+    Returns:
+        A tuple of the feature row and the demand function returned by ``self._get_Y`` for
+        the same (shifted) position.
+
+    Raises:
+        IndexError: if the index lies outside the selected dataset.
+        ValueError: if ``self.dataset_type`` is none of "train", "val", "test", "online".
+    """
+
+    if self.dataset_type == "train":
+        if index > self.train_index_end:
+            raise IndexError('Index out of bounds')
+        idx = index
+
+    elif self.dataset_type == "val":
+        idx = index + self.val_index_start
+
+        # without a test set the validation set runs to the end of the data
+        val_end = self.test_index_start if self.test_index_start is not None else len(self.X)
+        if idx >= val_end:
+            raise IndexError(f'index{idx} out of range{val_end}')
+
+    elif self.dataset_type == "test":
+        idx = index + self.test_index_start
+
+        if idx >= len(self.X):
+            raise IndexError(f'index{idx} out of range{len(self.X)}')
+
+    elif self.dataset_type == "online":
+        idx = index
+
+        if idx >= len(self.X):
+            raise IndexError(f'index{idx} out of range{len(self.X)}')
+
+    else:
+        raise ValueError('dataset_type not set')
+
+    # use the shifted position for both the features and the demand function
+    return self.X[idx], self._get_Y(idx)
+
+def _get_Y(self, index: int):
+
+    """
+    Get the Y (demand) function for the datapoint at ``index``.
+
+    alpha, beta and function_form are taken from row ``index`` if they are stored as arrays,
+    otherwise the single stored value is used. epsilon is always taken from row ``index``.
+
+    Returns:
+        A function ``f(X, action)``. With ``z = np.dot(alpha, X) + np.dot(beta, X) * action``
+        it computes ``z`` ('linear'), ``exp(z) / (1 + exp(z))`` ('log') or ``exp(z)`` ('exp'),
+        adds epsilon and clips the result at 0 from below.
+
+    Raises:
+        NotImplementedError: if the function form is 'probit'.
+        ValueError: if the function form is not recognized.
+    """
+
+    alpha = self.alpha[index] if isinstance(self.alpha, np.ndarray) else self.alpha
+    beta = self.beta[index] if isinstance(self.beta, np.ndarray) else self.beta
+    function_form = self.function_form[index] if isinstance(self.function_form, np.ndarray) else self.function_form
+
+    # a per-datapoint function_form is stored as a column, so a row is a one-element array;
+    # unwrap it so that the comparisons below work on a plain string
+    if isinstance(function_form, np.ndarray) and function_form.size == 1:
+        function_form = function_form.item()
+
+    epsilon = self.epsilon[index]
+
+    def utility(X, action):
+        return np.dot(alpha, X) + np.dot(beta, X) * action
+
+    if function_form == 'linear':
+        def linear(X, action):
+            demand = utility(X, action) + epsilon
+            return np.maximum(demand, 0)
+        return linear
+
+    if function_form == 'log':
+        def log(X, action):
+            exp_utility = np.exp(utility(X, action))
+            demand = np.divide(exp_utility, 1 + exp_utility) + epsilon
+            return np.maximum(demand, 0)
+        return log
+
+    if function_form == 'exp':
+        def exp(X, action):
+            demand = np.exp(utility(X, action)) + epsilon
+            return np.maximum(demand, 0)
+        return exp
+
+    if function_form == 'probit':  # TODO: think about this more
+        # drafted formula: norm.cdf(-np.dot(beta, X) * action) * np.dot(alpha, X) + epsilon
+        raise NotImplementedError('Probit not implemented yet')
+
+    raise ValueError(f'function_form {function_form!r} not recognized')
+
+def __len__(self):
+    """Return the number of entries in ``self.X``; ``dataset_type`` is not taken into account."""
+    features = self.X
+    return len(features)
+
+@property
+def X_shape(self):
+    """Shape of the feature array ``self.X``."""
+    features = self.X
+    return features.shape
+
+@property
+def Y_shape(self):
+    """Shape of the target, reported as the shape of the noise array ``self.epsilon``."""
+    noise = self.epsilon
+    return noise.shape
+
+@property
+def len_val(self):
+    """
+    Number of datapoints in the validation set.
+
+    The validation set runs from ``val_index_start`` up to ``test_index_start`` or, if no
+    test set is defined, up to the end of the data.
+
+    Raises:
+        ValueError: if no validation set is defined.
+    """
+    if self.val_index_start is None:
+        raise ValueError('no validation set defined')
+    val_end = self.test_index_start if self.test_index_start is not None else len(self.X)
+    return val_end - self.val_index_start
+
+@property
+def len_test(self):
+    """
+    Number of datapoints in the test set, i.e. from ``test_index_start`` to the end of the data.
+
+    Raises:
+        ValueError: if no test set is defined.
+    """
+    if self.test_index_start is None:
+        raise ValueError('no test set defined')
+    # this loader stores no Y array (Y is a function), so the length is taken from X
+    return len(self.X) - self.test_index_start
+
+def get_all_X(self,
+            dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
+            ):
+
+    """
+    Returns a copy of the entire features dataset.
+    Return either the train, val, test, or all data ('online' is treated like 'all').
+
+    The train data ends at ``self.train_index_end``, so it stops before the test set
+    even when no validation set is defined.
+
+    Returns None if ``self.X`` is None.
+
+    Raises:
+        ValueError: if ``dataset_type`` is not recognized.
+    """
+
+    if dataset_type == 'train':
+        rows = slice(None, self.train_index_end + 1)
+    elif dataset_type == 'val':
+        rows = slice(self.val_index_start, self.test_index_start)
+    elif dataset_type == 'test':
+        rows = slice(self.test_index_start, None)
+    elif dataset_type == 'all' or dataset_type == 'online':
+        rows = slice(None)
+    else:
+        raise ValueError('dataset_type not recognized')
+
+    if self.X is None:
+        return None
+    return self.X[rows].copy()
+
+def get_all_Y(self,
+                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
+                ):
+
+    """
+    Returns the parameters that define the entire target dataset.
+    Return either the train, val, test, or all data ('online' is treated like 'all').
+
+    alpha, beta and function_form are sliced and copied if they are stored as arrays;
+    otherwise the single stored value is returned as is. epsilon is always sliced and copied.
+    The train data ends at ``self.train_index_end``.
+
+    Returns:
+        The tuple ``(alpha, beta, function_form, epsilon)`` for the selected dataset.
+
+    Raises:
+        ValueError: if ``dataset_type`` is not recognized.
+    """
+    if dataset_type == 'train':
+        rows = slice(None, self.train_index_end + 1)
+    elif dataset_type == 'val':
+        rows = slice(self.val_index_start, self.test_index_start)
+    elif dataset_type == 'test':
+        rows = slice(self.test_index_start, None)
+    elif dataset_type == 'all' or dataset_type == 'online':
+        rows = slice(None)
+    else:
+        raise ValueError('dataset_type not recognized')
+
+    def select(values):
+        # per-datapoint arrays are restricted to the selected rows; a single value applies to all rows
+        if isinstance(values, np.ndarray):
+            return values[rows].copy()
+        return values
+
+    alpha = select(self.alpha)
+    beta = select(self.beta)
+    function_form = select(self.function_form)
+    epsilon = self.epsilon[rows].copy()
+
+    return alpha, beta, function_form, epsilon
+
+