-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
64e2176
commit ab22103
Showing
6 changed files
with
1,658 additions
and
308 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,336 @@ | ||
"""Dataloaders for Online env""" | ||
|
||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/10_dataloaders/13_online_dataloaders.ipynb. | ||
|
||
# %% auto 0 | ||
# Only names that exist at module level may be listed here. The entries that
# used to follow 'OnlineDataLoader' (normalize_features, X_shape, ...) are
# methods/properties of the class, so `from <module> import *` raised
# AttributeError while resolving them.
# NOTE: this file is generated by nbdev; mirror the change in the notebook so
# it is not reverted on the next export.
__all__ = ['OnlineDataLoader']
|
||
# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 3 | ||
import logging | ||
logging.basicConfig(level=logging.INFO) | ||
|
||
import numpy as np | ||
from abc import ABC, abstractmethod | ||
from typing import Union, Tuple, List, Literal | ||
import pandas as pd | ||
import math | ||
from scipy.stats import norm | ||
from .base import BaseDataLoader | ||
|
||
from sklearn.preprocessing import StandardScaler, MinMaxScaler | ||
|
||
# %% ../../nbs/10_dataloaders/13_online_dataloaders.ipynb 4 | ||
class OnlineDataLoader(BaseDataLoader):

    """
    Dataloader for online settings in which the target depends on the action taken.

    X is a numpy array. Y is not stored; instead every item carries a function
    built from alpha, beta, function_form and epsilon.
    X may be of shape (datapoints, features). TODO: Lag features
    Y is of shape (datapoints, units) and takes as input (X, action).
    """

    def __init__(self,
        X: np.ndarray,
        alpha: float | np.ndarray,
        beta: float | np.ndarray,
        epsilon: np.ndarray,
        function_form: str | np.ndarray = 'linear',
        val_index_start: int | None = None,
        test_index_start: int | None = None,
        normalize_features: dict | None = None,
        ):
        """
        Store the data, derive the train/val/test boundaries and normalize X.

        One-dimensional arrays (X, epsilon, alpha, beta, function_form) are
        reshaped to column vectors. Array-valued alpha, beta and function_form
        are indexed by datapoint when an item is requested.

        normalize_features is a dict of keyword arguments forwarded to
        ``self.normalize_features``; it defaults to
        ``{'normalize': True, 'ignore_one_hot': True}``.

        Raises AssertionError if the first dimensions of the inputs disagree.
        """

        # Bring every array into (datapoints, ...) form *before* anything else
        # uses it: the scaler needs 2-D input, and reshaping afterwards would
        # replace the normalized features with the raw ones.
        self.X = X.reshape(-1, 1) if X.ndim == 1 else X
        self.epsilon = epsilon.reshape(-1, 1) if epsilon.ndim == 1 else epsilon

        self.alpha = alpha
        if isinstance(alpha, np.ndarray) and alpha.ndim == 1:
            self.alpha = alpha.reshape(-1, 1)

        self.beta = beta
        if isinstance(beta, np.ndarray) and beta.ndim == 1:
            self.beta = beta.reshape(-1, 1)

        self.function_form = function_form
        if isinstance(function_form, np.ndarray) and function_form.ndim == 1:
            self.function_form = function_form.reshape(-1, 1)

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start - 1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start - 1
        else:
            self.train_index_end = len(self.X) - 1

        self.dataset_type = "train"

        per_datapoint_params = (self.alpha, self.beta, self.function_form)
        if all(isinstance(param, np.ndarray) for param in per_datapoint_params):
            assert (
                self.alpha.shape[0]
                == self.beta.shape[0]
                == self.function_form.shape[0]
                == self.X.shape[0]
                == self.epsilon.shape[0]
            ), "alpha, beta, X, epsilon and function_form must have the same length"
        else:
            assert self.X.shape[0] == self.epsilon.shape[0], "X and epsilon must have the same length"

        self.num_units = self.epsilon.shape[1]

        # The parameter shadows the method name only locally; the attribute
        # lookup on self still resolves to the method.
        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
        self.normalize_features(**normalize_features, initial_normalization=True)

        super().__init__()

    def normalize_features(self,
        normalize: bool = True,
        ignore_one_hot: bool = True,
        initial_normalization: bool = False
        ):
        """
        Normalize features using a standard scaler fitted on the training rows.

        If ignore_one_hot is True, one-hot encoded columns are not normalized;
        a column counts as one-hot encoded when it contains only the values 0
        and 1. The normalized features are stored in ``self.X`` as a new float
        array, so the array passed to the constructor is left untouched.

        Only the initial normalization (before lag features are added) is
        supported. Calling with initial_normalization=False leaves the data
        unchanged and logs a warning.

        Raises ValueError if X already has a lag (third) dimension.
        """

        if not normalize:
            return

        if not initial_normalization:
            # Idea:
                # remove time dimension
                # normalize features
                # add time_dimension back
            # Problem:
                # usage of prep_lag_features needs to ensure y is not added a second time
            logging.warning('Normalization after lag features have been set not implemented yet; features left unchanged')
            return

        if len(self.X.shape) == 3:
            raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')

        if ignore_one_hot:
            scale_columns = ~np.all(np.isin(self.X, (0, 1)), axis=0)
        else:
            scale_columns = np.ones(self.X.shape[1], dtype=bool)

        if not scale_columns.any():
            return

        scaler = StandardScaler()
        # +1 so the last training point is part of the fit
        scaler.fit(self.X[:self.train_index_end + 1][:, scale_columns])

        # transform() returns a new array, so the result must be written back.
        normalized = self.X.astype(float)
        normalized[:, scale_columns] = scaler.transform(self.X[:, scale_columns])
        self.X = normalized

    def prep_lag_features(self,
        lag_window: int = 0,
        include_y: bool = False,
        pre_calc: bool = False
        ):
        """
        Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
        If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
        window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
        including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
        computation time later on at the expense of increased memory usage.
        TODO: address the fact that this needs to be done directly before calling getitem especially when include_y is true
        """
        raise NotImplementedError('Not implemented yet')

    def update_lag_features(self,
        lag_window: int,
        ):

        """ Update lag window parameters for dataloader object that is already initialized """

        raise NotImplementedError('Not implemented yet')

        # Problem: updating lag_features naively would shorten the dataset each time it is called

    def _val_index_end(self) -> int:
        """ Exclusive end of the validation split: the test start, or the end of the data if there is no test split. """
        if self.test_index_start is not None:
            return self.test_index_start
        return len(self.X)

    def _split_slice(self, dataset_type: str) -> slice:
        """
        Return the row slice that selects the requested split.

        Raises ValueError for an unknown dataset_type.
        """
        if dataset_type == 'train':
            # train_index_end already accounts for a missing validation split,
            # so test rows never end up in the training data.
            return slice(None, self.train_index_end + 1)
        if dataset_type == 'val':
            return slice(self.val_index_start, self.test_index_start)
        if dataset_type == 'test':
            return slice(self.test_index_start, None)
        if dataset_type in ('all', 'online'):
            return slice(None)
        raise ValueError('dataset_type not recognized')

    def __getitem__(self, index: int):

        """
        Get item by index, depending on the dataset type (train, val, test, online).

        The index is relative to the start of the active split. Returns the
        feature row and the demand function for that row.

        Raises IndexError if the index lies beyond the active split and
        ValueError if the requested split is not defined or dataset_type is unknown.
        """

        if self.dataset_type == "train":
            if index > self.train_index_end:
                raise IndexError('Index out of bounds')
            idx = index

        elif self.dataset_type == "val":
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            idx = index + self.val_index_start
            val_end = self._val_index_end()

            if idx >= val_end:
                raise IndexError(f'index{idx} out of range{val_end}')

        elif self.dataset_type == "test":
            if self.test_index_start is None:
                raise ValueError('no test set defined')
            idx = index + self.test_index_start

            if idx >= len(self.X):
                raise IndexError(f'index{idx} out of range{len(self.X)}')

        elif self.dataset_type == "online":
            idx = index

            if idx >= len(self.X):
                raise IndexError(f'index{idx} out of range{len(self.X)}')

        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self._get_Y(idx)

    def _get_Y(self, index: int):

        """
        Get Y function depending on the current index.

        The returned callable takes (X, action) and returns the demand,
        clipped at zero. Array-valued alpha, beta and function_form are looked
        up at ``index``; scalar values are used as they are.

        Raises NotImplementedError for 'probit' and ValueError for an unknown
        function form.
        """

        alpha = self.alpha[index] if isinstance(self.alpha, np.ndarray) else self.alpha
        beta = self.beta[index] if isinstance(self.beta, np.ndarray) else self.beta
        epsilon = self.epsilon[index]

        if isinstance(self.function_form, np.ndarray):
            function_form = self.function_form[index]
            # A column-vector entry is a length-1 array; unwrap it so the
            # comparisons below work on a plain value.
            if isinstance(function_form, np.ndarray) and function_form.size == 1:
                function_form = function_form.item()
        else:
            function_form = self.function_form

        if function_form == 'linear':
            def linear(X, action):
                demand = np.dot(alpha, X) + np.dot(beta, X) * action + epsilon
                return np.maximum(demand, 0)
            return linear

        if function_form == 'log':
            def log(X, action):
                z = np.dot(alpha, X) + np.dot(beta, X) * action
                # Same logistic curve as exp(z) / (1 + exp(z)), but does not
                # turn into inf / inf = nan for large z.
                demand = 1.0 / (1.0 + np.exp(-z)) + epsilon
                return np.maximum(demand, 0)
            return log

        if function_form == 'exp':
            def exp(X, action):
                demand = np.exp(np.dot(alpha, X) + np.dot(beta, X) * action) + epsilon
                return np.maximum(demand, 0)
            return exp

        if function_form == 'probit':
            # TODO: think about this more. Candidate form:
            #   norm.cdf(-np.dot(beta, X) * action) * np.dot(alpha, X) + epsilon
            raise NotImplementedError('Probit not implemented yet')

        raise ValueError(f'function_form {function_form!r} not recognized')

    def __len__(self):
        """ Number of rows in X (all splits together). """
        return len(self.X)

    @property
    def X_shape(self):
        """ Shape of the full feature array. """
        return self.X.shape

    @property
    def Y_shape(self):
        """ Shape of the targets, which equals the shape of epsilon. """
        return self.epsilon.shape

    @property
    def len_val(self):
        """ Number of validation rows. Raises ValueError if no validation set is defined. """
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        return self._val_index_end() - self.val_index_start

    @property
    def len_test(self):
        """ Number of test rows. Raises ValueError if no test set is defined. """
        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return len(self.X) - self.test_index_start

    def get_all_X(self,
        dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
        ):

        """
        Returns a copy of the entire features dataset.
        Return either the train, val, test, or all data ('online' is treated like 'all').
        Raises ValueError if dataset_type is not recognized.
        """

        rows = self._split_slice(dataset_type)
        return self.X[rows].copy() if self.X is not None else None

    def get_all_Y(self,
        dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all', 'online'
        ):

        """
        Returns everything needed to build the targets as a tuple
        (alpha, beta, function_form, epsilon).
        Return either the train, val, test, or all data ('online' is treated like 'all').
        Array-valued entries are sliced to the split and copied; scalar entries
        are returned unchanged.
        Raises ValueError if dataset_type is not recognized.
        """

        rows = self._split_slice(dataset_type)

        def select(param):
            return param[rows].copy() if isinstance(param, np.ndarray) else param

        alpha = select(self.alpha)
        beta = select(self.beta)
        function_form = select(self.function_form)
        epsilon = self.epsilon[rows].copy()

        return alpha, beta, function_form, epsilon
|
||
|
Oops, something went wrong.