From 6df2c44559b4abbf85b1ca10f31613372ebc7a9a Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Sun, 5 Nov 2017 12:42:43 -0500
Subject: [PATCH 01/12] making arff description concise

---
 pyradigm/pyradigm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 111c48a..8be39ed 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1106,7 +1106,8 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
             raise NotImplementedError('encoding non-numeric features to numeric is not implemented yet! '
                                       'Encode features before exporting to ARFF.')
 
-        self.__description = 'ARFF relation {}\n read from {}'.format(arff_meta.name, arff_path)
+        # self.__description = 'ARFF relation {}\n read from {}'.format(arff_meta.name, arff_path)
+        self.__description = arff_meta.name  # to enable it as a label e.g. in neuropredict
 
         # initializing the key containers, before calling self.add_sample
         self.__data = OrderedDict()

From a304c600f7d775e178d3f01fffd2ba55cd5b3433 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 20:54:34 -0500
Subject: [PATCH 02/12] using only basic dict instead of OrderedDict; returned
 order may not reflect the original order of addition
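
A minimal usage sketch of the new flag (an illustrative note for this
series, not part of the patch itself; assumes `ds` is an existing
MLDataset):

    # with plain dicts, row order may differ between calls
    matrix, labels, ids = ds.data_and_labels()

    # sorted_ids=True guarantees the same (sorted) row order on every call
    m1, l1, ids1 = ds.data_and_labels(sorted_ids=True)
    m2, l2, ids2 = ds.data_and_labels(sorted_ids=True)
    assert list(ids1) == list(ids2)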
---
 pyradigm/pyradigm.py      | 75 +++++++++++++++++++++++++--------------
 pyradigm/test_pyradigm.py | 26 +++++++++++---
 2 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 8be39ed..53aec85 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -9,8 +9,8 @@
 import argparse
 import traceback
 import logging
-from os.path import join as pjoin, exists as pexists, realpath, basename, dirname, isfile
-from collections import Counter, OrderedDict, Sequence
+from os.path import exists as pexists, realpath, basename, dirname, isfile
+from collections import Counter, Sequence
 from itertools import islice
 
 import numpy as np
@@ -97,10 +97,9 @@ def __init__(self, filepath=None,
                 raise ValueError('Dataset to copy is empty.')
             self.__copy(in_dataset)
         elif data is None and labels is None and classes is None:
-            # TODO refactor the code to use only basic dict, as it allows for better equality comparisons
-            self.__data = OrderedDict()
-            self.__labels = OrderedDict()
-            self.__classes = OrderedDict()
+            self.__data = dict()
+            self.__labels = dict()
+            self.__classes = dict()
             self.__num_features = 0
             self.__dtype = None
             self.__description = ''
@@ -111,9 +110,9 @@ def __init__(self, filepath=None,
             self.__validate(data, labels, classes)
 
             # OrderedDict to ensure the order is maintained when data/labels are returned in a matrix/array form
-            self.__data = OrderedDict(data)
-            self.__labels = OrderedDict(labels)
-            self.__classes = OrderedDict(classes)
+            self.__data = dict(data)
+            self.__labels = dict(labels)
+            self.__classes = dict(classes)
             self.__description = description
 
             sample_ids = list(data)
@@ -135,11 +134,19 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self):
+    def data_and_labels(self, sorted_ids=False):
         """
         Dataset features and labels in a matrix form for learning.
 
-        Also returns sample_ids in the same order.
+        The row order of the data matrix and target labels matches that of
+        sample_ids, which is not guaranteed to be the same across calls.
+
+        Parameters
+        ----------
+
+        sorted_ids : bool
+            Flag to request data and sample ids in sorted order.
+            This guarantees the same order upon different calls.
 
         Returns
         -------
@@ -153,6 +160,9 @@ def data_and_labels(self):
         """
 
         sample_ids = np.array(self.keys)
+        if sorted_ids:
+            sample_ids.sort()
+
         label_dict = self.labels
         matrix = np.full([self.num_samples, self.num_features], np.nan)
         labels = np.full([self.num_samples, 1], np.nan)
@@ -332,7 +342,6 @@ def check_features(self, features):
 
         return features
 
-    # TODO try implementing based on pandas
     def add_sample(self, sample_id, features, label,
                    class_id=None,
                    overwrite=False,
@@ -481,7 +490,7 @@ def get_class(self, class_id):
 
         Parameters
         ----------
-        class_id : str
+        class_id : str or list
             identifier of the class to be returned.
 
         Returns
        -------
@@ -803,13 +812,13 @@
         class id to query.
 
         subset_ids = self.keys_with_value(self.classes, class_id)
         return subset_ids
 
-    def get_subset(self, subset_ids):
+    def get_subset(self, subset_ids_in):
         """
         Returns a smaller dataset identified by their keys/sample IDs.
 
         Parameters
         ----------
-        subset_ids : list
+        subset_ids_in : list
            List of sample IDs to extract from the dataset.
 
         Returns
        -------
        sub-dataset : MLDataset
            sub-dataset containing only requested sample IDs.
 
        """
 
-        num_existing_keys = sum([1 for key in subset_ids if key in self.__data])
+        subset_ids = [key for key in subset_ids_in if key in self.__data]
+        num_existing_keys = len(subset_ids)
 
         if subset_ids is not None and num_existing_keys > 0:
-            # need to ensure data are added to data, labels etc in the same order of sample IDs
-            # TODO come up with a way to do this even when not using OrderedDict()
-            # putting the access of data, labels and classes in the same loop would ensure there is correspondence
-            # across the three attributes of the class
+
             data = self.__get_subset_from_dict(self.__data, subset_ids)
             labels = self.__get_subset_from_dict(self.__labels, subset_ids)
             if self.__classes is not None:
@@ -872,8 +879,24 @@ def __iter__(self):
 
     @staticmethod
     def __get_subset_from_dict(input_dict, subset):
-        # Using OrderedDict helps ensure data are added to data, labels etc in the same order of sample IDs
-        return OrderedDict((sid, value) for sid, value in input_dict.items() if sid in subset)
+        """
+        Returns dictionary formed from the subset of keys.
+
+        Parameters
+        ----------
+        input_dict : dict
+
+        subset : iterable
+            List of keys to be extracted.
+
+        Returns
+        -------
+        dict
+
+        """
+
+        # the `if sid in input_dict` guard avoids a KeyError, although only keys already verified to exist are passed in
+        return {sid: input_dict[sid] for sid in subset if sid in input_dict}
 
     @property
     def keys(self):
@@ -1110,7 +1133,13 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
         self.__description = arff_meta.name  # to enable it as a label e.g. in neuropredict
 
         # initializing the key containers, before calling self.add_sample
-        self.__data = OrderedDict()
-        self.__labels = OrderedDict()
-        self.__classes = OrderedDict()
+        self.__data = dict()
+        self.__labels = dict()
+        self.__classes = dict()
 
         num_samples = len(arff_data)
         num_digits = len(str(num_samples))
-        make_id = lambda index: 'row{index:0{nd}d}'.format(index=index,nd=num_digits)
+        make_id = lambda num_index: 'row{id:0{nd}d}'.format(id=num_index,nd=num_digits)
         sample_classes = [cls.decode('utf-8') for cls in arff_data['class']]
         class_set = set(sample_classes)
         label_dict = dict()
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index b2c75b7..8f36a1b 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -1,5 +1,6 @@
 import os, sys
 import numpy as np
+import random
 from os.path import join as pjoin, exists as pexists, realpath, basename, dirname, isfile
 
 sys.dont_write_bytecode = True
@@ -31,7 +32,10 @@
 test_dataset = MLDataset()
 
 for class_index, class_id in enumerate(class_set):
-    for sub_ix in range(class_sizes[class_index]):
+    numeric_ids = list(range(class_sizes[class_index]))
+    # to ensure tests don't depend on the order of addition
+    random.shuffle(numeric_ids)
+    for sub_ix in numeric_ids:
         subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
         feat = np.random.random(num_features)
         test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
@@ -120,9 +124,23 @@ def test_invalid_constructor():
                         classes='invalid_value')
 
 def test_return_data_labels():
-    matrix, vec_labels, sub_ids = test_dataset.data_and_labels()
-    assert len(vec_labels)==len(sub_ids)
-    assert len(vec_labels)==matrix.shape[0]
+
+    matrix1, vec_labels1, sub_ids1 = test_dataset.data_and_labels()
+    assert len(vec_labels1)==len(sub_ids1)
+    assert len(vec_labels1)==matrix1.shape[0]
+
+
+def test_return_data_labels_sorted():
+    matrix1, vec_labels1, sub_ids1 = test_dataset.data_and_labels(sorted_ids=True)
+    assert len(vec_labels1)==len(sub_ids1)
+    assert len(vec_labels1)==matrix1.shape[0]
+
+    matrix2, vec_labels2, sub_ids2 = test_dataset.data_and_labels(sorted_ids=True)
+    assert np.all(vec_labels1==vec_labels2)
+    assert np.all(matrix1==matrix2)
+    assert np.all(sub_ids1==sub_ids2)
+    assert matrix1.shape == matrix2.shape
 
 def test_init_with_dict():
     new_ds = MLDataset(data=test_dataset.data, labels=test_dataset.labels, classes=test_dataset.classes)

From 89af9e17c49763b0371404fae1c3995767ace3d0 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 21:06:14 -0500
Subject: [PATCH 03/12] informing key when data differs between two datasets

---
 pyradigm/pyradigm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 53aec85..f3160ae 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1299,7 +1299,7 @@ def __eq__(self, other):
         elif id(self.__data) != id(other.data):
             for key in self.keys:
                 if not np.all(self.data[key] == other.data[key]):
-                    print('differing data for the sample ids.')
+                    print('differing data for at least one sample id: {}'.format(key))
                     return False
             return True
         else:

From bb7fb9ca87c5760c4099470ed4b44a139ccce7b7 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 22:23:18 -0500
Subject: [PATCH 04/12] tighter management of data type

---
 pyradigm/pyradigm.py      | 19 +++++++++++--------
 pyradigm/test_pyradigm.py |  6 +++++-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index f3160ae..1aae75b 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -109,7 +109,6 @@ def __init__(self, filepath=None,
             # but only in data, labels and classes, not feature names
             self.__validate(data, labels, classes)
 
-            # OrderedDict to ensure the order is maintained when data/labels are returned in a matrix/array form
             self.__data = dict(data)
             self.__labels = dict(labels)
             self.__classes = dict(classes)
@@ -118,7 +117,7 @@ def __init__(self, filepath=None,
             sample_ids = list(data)
             features0 = data[sample_ids[0]]
             self.__num_features = features0.size if isinstance(features0,np.ndarray) else len(features0)
-            self.__dtype = type(data[sample_ids[0]])
+            self.__dtype = np.array(data[sample_ids[0]]).dtype
 
             # assigning default names for each feature
             if feature_names is None:
@@ -164,7 +163,7 @@ def data_and_labels(self, sorted_ids=False):
             sample_ids.sort()
 
         label_dict = self.labels
-        matrix = np.full([self.num_samples, self.num_features], np.nan)
+        matrix = np.empty([self.num_samples, self.num_features], dtype=self.dtype)
         labels = np.full([self.num_samples, 1], np.nan)
         for ix, sample in enumerate(sample_ids):
             matrix[ix, :] = self.__data[sample]
@@ -245,7 +244,7 @@ def feature_names(self):
 
     @feature_names.setter
     def feature_names(self, names):
-        "Stores the text labels for features"
+        """Stores the text labels for features"""
 
         if len(names) != self.num_features:
             raise ValueError("Number of names does not match the number of features!")
@@ -334,6 +333,10 @@ def check_features(self, features):
         if not isinstance(features, np.ndarray):
             features = np.asarray(features)
 
+        # # avoid numeric restrictions to enable pyradigm's utility for a broader range of applications.
+        # if not np.issubdtype(features.dtype, np.number):
+        #     raise TypeError('non-numeric features are provided, which are not supported!')
+
         if features.size <= 0:
             raise ValueError('provided features are empty.')
 
@@ -394,7 +397,7 @@ def add_sample(self, sample_id, features, label,
             self.__data[sample_id] = features
             self.__labels[sample_id] = label
             self.__classes[sample_id] = class_id
-            self.__dtype = type(features)
+            self.__dtype = features.dtype
             self.__num_features = features.size if isinstance(features, np.ndarray) else len(features)
             if feature_names is None:
                 self.__feature_names = self.__str_names(self.num_features)
@@ -402,7 +405,7 @@ def add_sample(self, sample_id, features, label,
             if self.__num_features != features.size:
                 raise ValueError('dimensionality of this sample ({}) does not match existing samples ({})'.format(
                     features.size, self.__num_features))
-            if not isinstance(features, self.__dtype):
+            if features.dtype != self.__dtype:
                 raise TypeError("Mismatched dtype. Provide {}".format(self.__dtype))
 
             self.__data[sample_id] = features
@@ -939,8 +942,8 @@ def dtype(self):
     @dtype.setter
     def dtype(self, type_val):
         if self.__dtype is None:
-            if not isinstance(type_val, type):
-                raise TypeError('Invalid data type.')
+            if not isinstance(type_val, np.dtype):
+                raise TypeError('Invalid data type. It must be a valid numpy dtype!')
             self.__dtype = type_val
         else:
             warnings.warn('Data type is already inferred. Cannot be set!')
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 8f36a1b..60dbd51 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -26,6 +26,7 @@
 num_classes = np.random.randint( 2, 50)
 class_sizes = np.random.randint(10, 1000, num_classes)
 num_features = np.random.randint(10, 500)
+data_type = 'float32'
 
 class_set = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
 feat_names = np.array([ str(x) for x in range(num_features) ])
@@ -37,7 +38,7 @@
     random.shuffle(numeric_ids)
     for sub_ix in numeric_ids:
         subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
-        feat = np.random.random(num_features)
+        feat = np.random.random(num_features).astype(data_type)
         test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
 
 out_file = os.path.join(out_dir,'random_example_dataset.pkl')
@@ -82,6 +83,9 @@ def test_num_classes():
 def test_num_features():
     assert test_dataset.num_features == num_features
 
+def test_dtype():
+    assert np.issubdtype(test_dataset.dtype, data_type)
+
 def test_num_features_setter():
     with raises(AttributeError):
         test_dataset.num_features = 0

From ea0a0e9ffd9fbadf5141331bf0768fbe177a43f3 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 22:35:26 -0500
Subject: [PATCH 05/12] options to cast features to a different data type
 (e.g. to change precision to save memory/disk space)
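
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; assumes `ds` is an existing MLDataset with
float64 features):

    import numpy as np

    # cast features to half precision to reduce the memory footprint;
    # out_data_type must be a numpy dtype, else the dataset's own dtype is used
    matrix, labels, ids = ds.data_and_labels(out_data_type=np.dtype('float16'))
    assert matrix.dtype == np.float16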
---
 pyradigm/pyradigm.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 1aae75b..49677f3 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -133,7 +133,7 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self, sorted_ids=False):
+    def data_and_labels(self, out_data_type=None, sorted_ids=False):
         """
         Dataset features and labels in a matrix form for learning.
 
@@ -142,6 +142,8 @@
         Parameters
         ----------
 
+        out_data_type : numpy.dtype
+            Valid numpy dtype to cast the features to.
 
         sorted_ids : bool
             Flag to request data and sample ids in sorted order.
@@ -150,7 +152,7 @@
         Returns
         -------
         data_matrix : ndarray
-            2D array of shape [num_samples, num_features] with features  corresponding row-wise to sample_ids
+            2D array of shape [num_samples, num_features] with features corresponding row-wise to sample_ids
         labels : ndarray
             Array of numeric labels for each sample corresponding row-wise to sample_ids
         sample_ids : list
             List of sample ids in the same order as features and labels.
@@ -161,14 +163,17 @@
         """
 
         sample_ids = np.array(self.keys)
         if sorted_ids:
             sample_ids.sort()
 
+        if out_data_type is None or not isinstance(out_data_type, np.dtype):
+            out_data_type = self.dtype
+
         label_dict = self.labels
-        matrix = np.empty([self.num_samples, self.num_features], dtype=self.dtype)
+        matrix = np.empty([self.num_samples, self.num_features], dtype=out_data_type)
         labels = np.full([self.num_samples, 1], np.nan)
         for ix, sample in enumerate(sample_ids):
-            matrix[ix, :] = self.__data[sample]
+            matrix[ix, :] = self.__data[sample].astype(out_data_type)
             labels[ix] = label_dict[sample]
 
         return matrix, np.ravel(labels), sample_ids

From b9fd0978c55fd5cf94c687bd972d53610ce765a8 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 21 Nov 2017 16:58:06 -0500
Subject: [PATCH 06/12] new flag to display extended info about the dataset

---
 pyradigm/pyradigm.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 49677f3..dae13be 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1329,14 +1329,14 @@ def cli_run():
 
     """
 
-    path_list, meta_requested, summary_requested, add_path_list, out_path = parse_args()
+    path_list, meta_requested, summary_requested, flag_extended, add_path_list, out_path = parse_args()
 
     # printing info if requested
     if path_list:
         for ds_path in path_list:
             ds = MLDataset(ds_path)
             if summary_requested:
-                print_info(ds, ds_path)
+                print_info(ds, ds_path, extended=flag_extended)
             if meta_requested:
                 print_meta(ds, ds_path)
 
@@ -1347,7 +1347,7 @@
     return
 
-def print_info(ds, ds_path=None):
+def print_info(ds, ds_path=None, extended=False):
     "Prints basic summary of a given dataset."
 
     if ds_path is None:
         bname = ''
     else:
         bname = basename(ds_path)
 
     dashes = '-' * len(bname)
     print(bname)
-    print(dashes)
     print(ds)
+    if extended:
+        print('\nfeature names :\n{}'.format(ds.feature_names))
     print(dashes)
 
     return
@@ -1412,6 +1413,9 @@ def get_parser():
     parser.add_argument('-i', '--info', action='store_true', dest='summary_requested', required=False,
                         default=False, help='Prints summary info (classes, #samples, #features).')
 
+    parser.add_argument('-e', '--extended', action='store_true', dest='extended_summary_requested', required=False,
+                        default=False, help='Prints extended summary info (adding feature names etc).')
+
     arithmetic_group = parser.add_argument_group('Options for multiple datasets')
     arithmetic_group.add_argument('-a', '--add', nargs='+', action='store', dest='add_path_list', required=False,
                                   default=None, help='List of MLDatasets to combine into a larger dataset.')
@@ -1469,11 +1473,18 @@ def parse_args():
         if len(add_path_list) < 2:
             raise ValueError('Need a minimum of two datasets to combine!')
 
+    extended_summary_requested = params.extended_summary_requested
+    summary_requested = params.summary_requested
+    if extended_summary_requested:
+        summary_requested = True
+
     # removing duplicates (from regex etc)
     path_list = set(path_list)
     add_path_list = set(add_path_list)
 
-    return path_list, params.meta_requested, params.summary_requested, add_path_list, out_path
+    return path_list, params.meta_requested, \
+           summary_requested, extended_summary_requested, \
+           add_path_list, out_path
 
 if __name__ == '__main__':

From e0622dffd75fbc041fc6d68ef855d5088d22447f Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 23 Nov 2017 23:06:23 -0500
Subject: [PATCH 07/12] fully randomizing the insertion order of samples and
 classes

---
 pyradigm/test_pyradigm.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 60dbd51..b6a0602 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -28,18 +28,34 @@
 num_features = np.random.randint(10, 500)
 data_type = 'float32'
 
-class_set = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
+class_set = [ 'C{}'.format(x) for x in range(num_classes) ]
 feat_names = np.array([ str(x) for x in range(num_features) ])
 
+sample_ids = list()
+class_ids = list()
+num_labels = list()
+for class_index, cls_id in list(enumerate(class_set)):
+    ids_this_class = list([ '{}_S{}'.format(cls_id, sub_ix) for sub_ix in list(range(class_sizes[class_index]))])
+    sample_ids.extend(ids_this_class)
+    class_ids.extend([cls_id] * class_sizes[class_index])
+    num_labels.extend([class_index]*class_sizes[class_index])
+
+sample_ids = np.array(sample_ids)
+class_ids = np.array(class_ids)
+num_labels = np.array(num_labels)
+
+# to ensure tests don't depend on the order of sample/class addition
+shuffle_order = list(range(len(sample_ids)))
+random.shuffle(shuffle_order)
+
+sample_ids = sample_ids[shuffle_order]
+class_ids = class_ids[shuffle_order]
+num_labels = num_labels[shuffle_order]
+
 test_dataset = MLDataset()
-for class_index, class_id in enumerate(class_set):
-    numeric_ids = list(range(class_sizes[class_index]))
-    # to ensure tests don't depend on the order of addition
-    random.shuffle(numeric_ids)
-    for sub_ix in numeric_ids:
-        subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
-        feat = np.random.random(num_features).astype(data_type)
-        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
+for ix, sid in enumerate(sample_ids):
+    feat = np.random.random(num_features).astype(data_type)
+    test_dataset.add_sample(sid, feat, num_labels[ix], class_ids[ix], feat_names)
 
 out_file = os.path.join(out_dir,'random_example_dataset.pkl')
 test_dataset.save(out_file)

From 6505d1da96bf1db15491202bad5b208f3117812c Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 23 Nov 2017 23:06:47 -0500
Subject: [PATCH 08/12] new option to return data matrix grouped by class
 membership
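
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; assumes `ds` is an existing MLDataset):

    # rows are returned class by class, e.g. to visualize class-wise patterns
    matrix, labels, ids = ds.data_and_labels(group_by_class=True)

    # within each class, rows can additionally be sorted by sample id
    matrix, labels, ids = ds.data_and_labels(group_by_class=True, sorted_ids=True)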
---
 pyradigm/pyradigm.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index dae13be..c44d925 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -133,7 +133,10 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self, out_data_type=None, sorted_ids=False):
+    def data_and_labels(self,
+                        out_data_type=None,
+                        sorted_ids=False,
+                        group_by_class=False):
         """
         Dataset features and labels in a matrix form for learning.
 
@@ -149,6 +152,10 @@
             Flag to request data and sample ids in sorted order.
             This guarantees the same order upon different calls.
 
+        group_by_class : bool
+            Flag to help group the rows in the output matrix by class.
+            This helps to quickly visualize the patterns in data by class.
+
         Returns
         -------
         data_matrix : ndarray
@@ -160,9 +167,17 @@
 
         """
 
-        sample_ids = np.array(self.keys)
-        if sorted_ids:
-            sample_ids.sort()
+        if group_by_class:
+            sample_ids = list()
+            for cls in self.class_set:
+                ids_in_class = np.array(self.sample_ids_in_class(cls))
+                if sorted_ids:
+                    ids_in_class.sort()
+                sample_ids.extend(ids_in_class)
+        else:
+            sample_ids = np.array(self.keys)
+            if sorted_ids:
+                sample_ids.sort()

From 9f664f25f51c3ae236ac9f8c62c1795ad99dcbaf Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 24 Nov 2017 18:46:45 -0500
Subject: [PATCH 09/12] new constructor for arff, which makes it cleaner:
 MLDataset.arff(path)
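
A sketch of the new constructor, mirroring the test change below (the ARFF
path and import line are illustrative, not part of the patch itself):

    from pyradigm import MLDataset

    # previously: MLDataset(arff_path='/path/to/iris.arff')
    ds = MLDataset.arff('/path/to/iris.arff')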
---
 pyradigm/pyradigm.py      | 13 +++++++++++++
 pyradigm/test_pyradigm.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index c44d925..356f223 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -128,6 +128,18 @@ def __init__(self, filepath=None,
         else:
             raise ValueError('Incorrect way to construct the dataset.')
 
+    @classmethod
+    def arff(cls, arff_path, encode_nonnumeric=False):
+        "Constructor from ARFF file"
+
+        ds = cls.__new__(cls)
+        if isfile(arff_path):
+            ds = ds.__load_arff(arff_path, encode_nonnumeric)
+        else:
+            raise IOError('Given ARFF cannot be found!')
+
+        return ds
+
     @property
     def data(self):
         """data in its original dict form."""
@@ -1184,6 +1196,7 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
 
         self.__feature_names = attr_names
 
         return
+        return self
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index b6a0602..cc5af7d 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -331,7 +331,7 @@ def test_train_test_split_ids_perc():
 
 def test_load_arff():
     arff_path = realpath(pjoin(dirname(__file__),'../example_datasets/iris.arff'))
-    mld = MLDataset(arff_path=arff_path)
+    mld = MLDataset.arff(arff_path)
 
     if mld.num_samples != 150:
         raise ValueError('number of samples mismatch')

From 5a19da31b828c7f3a604f565a3c5577fcd68f7ac Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 24 Nov 2017 18:47:23 -0500
Subject: [PATCH 10/12] new copy constructor, which makes it cleaner:
 MLDataset.copy(ds)
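
A sketch of the new constructor, mirroring the test change below (an
illustrative note, not part of the patch itself; assumes `ds` is an
existing MLDataset):

    # equivalent to MLDataset(in_dataset=ds), but reads more clearly
    ds_copy = MLDataset.copy(ds)
    assert ds_copy == ds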
---
 pyradigm/pyradigm.py      | 15 ++++++++++++++-
 pyradigm/test_pyradigm.py |  5 +++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 356f223..d16e6a7 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -140,6 +140,20 @@ def arff(cls, arff_path, encode_nonnumeric=False):
 
         return ds
 
+    @classmethod
+    def copy(cls, in_dataset):
+        "Copy constructor"
+
+        if not isinstance(in_dataset, MLDataset):
+            raise ValueError('Invalid class input: MLDataset expected!')
+        if in_dataset.num_samples <= 0:
+            raise ValueError('Dataset to copy is empty.')
+
+        ds = cls.__new__(cls)
+        ds = ds.__copy(in_dataset)
+
+        return ds
+
     @property
     def data(self):
         """data in its original dict form."""
@@ -1195,7 +1209,6 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
 
         self.__feature_names = attr_names
 
-        return
         return self
 
     def save(self, file_path):
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index cc5af7d..16096d2 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -242,6 +242,9 @@ def test_eq_copy():
     new_copy = MLDataset(in_dataset=copy_dataset)
     assert new_copy == copy_dataset
 
+    new_copy2 = MLDataset.copy(copy_dataset)
+    assert new_copy2 == copy_dataset
+
 def test_unpickling():
     out_file = os.path.join(out_dir, 'random_pickled_dataset.pkl')
     copy_dataset.save(out_file)
@@ -346,5 +349,3 @@ def test_load_arff():
         raise ValueError('length of feature names do not match number of features')
 
     # print(mld)
-
-test_load_arff()
\ No newline at end of file

From d35714dd15dd6e81aa716954d2d4cdd7e6b4c755 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 1 Mar 2018 21:46:21 -0500
Subject: [PATCH 11/12] more tests to ensure splits over repetitions are
 sufficiently random

---
 pyradigm/test_pyradigm.py | 79 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 16096d2..47efa70 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -72,6 +72,54 @@
 
 copy_dataset = MLDataset(in_dataset=test_dataset)
 
+# ------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------
+
+feat_generator = np.random.randn
+
+def make_random_MLdataset(max_num_classes = 20,
+                          max_class_size = 50,
+                          max_dim = 100,
+                          stratified = True):
+    "Generates a random MLDataset for use in testing."
+
+    smallest = 10
+    max_class_size = max(smallest, max_class_size)
+    largest = max(50, max_class_size)
+    largest = max(smallest+3,largest)
+
+    num_classes = np.random.randint(2, max_num_classes, 1)
+    if isinstance(num_classes, np.ndarray):
+        num_classes = num_classes[0]
+    if not stratified:
+        class_sizes = np.random.randint(smallest, largest+1,
+                                        size=[num_classes, 1])
+    else:
+        class_sizes = np.repeat(np.random.randint(smallest, largest),
+                                num_classes)
+
+    num_features = np.random.randint(min(3, max_dim), max(3, max_dim), 1)[0]
+    feat_names = [ str(x) for x in range(num_features)]
+
+    class_ids = list()
+    labels = list()
+    for cl in range(num_classes):
+        class_ids.append('class-{}'.format(cl))
+        labels.append(int(cl))
+
+    ds = MLDataset()
+    for cc, class_ in enumerate(class_ids):
+        subids = [ 'sub{:03}-class{:03}'.format(ix,cc) for ix in range(class_sizes[cc]) ]
+        for sid in subids:
+            ds.add_sample(sid, feat_generator(num_features), int(cc), class_, feat_names)
+
+    return ds
+
+
+# ------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------
+
+
 rand_index = np.random.randint(0,len(class_set),1)[0]
 random_class_name = class_set[rand_index]
 random_class_ds = test_dataset.get_class(random_class_name)
@@ -328,6 +376,37 @@ def test_train_test_split_ids_perc():
     with raises(ValueError):
         copy_dataset.train_test_split_ids(train_perc=-1)
 
+
+def test_train_test_split_is_sufficiently_random():
+    """Test to ensure ids in repeated splits are sufficiently random"""
+
+    rand_ds = make_random_MLdataset(max_num_classes=10, max_class_size=1000, max_dim=1)
+
+    total_num_rep = 1000
+    for perc in np.arange(0.25, 1.0, 0.2):
+        accum_train = list()
+        accum_test = list()
+        for rep in range(total_num_rep):
+            cur_train, cur_test = rand_ds.train_test_split_ids(train_perc=perc)
+            accum_train.extend(cur_train)
+            accum_test.extend(cur_test)
+
+        ids_train, counts_train = np.unique(accum_train, return_counts=True)
+        ids_test, counts_test = np.unique(accum_test, return_counts=True)
+
+        # if the splits were truly [sufficiently] random,
+        # the counts for different ids must be similar
+        # and close to the expected values below:
+        expected_count_train = total_num_rep * perc
+        expected_count_test = total_num_rep * (1.0 - perc)
+        within_tol = lambda count, expd: np.isclose(np.mean(count), expd, rtol=0.05)
+
+        if not within_tol(counts_train, expected_count_train) or \
+                not within_tol(counts_test, expected_count_test):
+            raise ValueError('train/test splits ({}%) are NOT sufficiently random '
+                             'over {} repetitions'.format(100*perc, total_num_rep))
+
 # ------------------------------------------------
 # different file formats
 # ------------------------------------------------

From bbe06a442f14816034abb5f8a3f53af9dd3a8dab Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 2 Mar 2018 11:43:13 -0500
Subject: [PATCH 12/12] new method to enable renaming of classes and features
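
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; the class and feature names are hypothetical,
and the feature list must match num_features):

    # remap one or more class names; classes missing from the mapping keep theirs
    ds.rename(class_name_dict={'class-0': 'healthy', 'class-1': 'disease'})

    # rename features (length is validated by the feature_names setter)
    ds.rename(feature_names=['thickness', 'area', 'volume'])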
---
 pyradigm/pyradigm.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index d16e6a7..6dc8fe8 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -284,6 +284,23 @@ def classes(self, values):
         else:
             raise ValueError('classes input must be a dictionary!')
 
+    def rename(self, class_name_dict=None, feature_names=None):
+        """Method to remap class names, and rename features"""
+
+        if class_name_dict is not None and feature_names is None:
+            if isinstance(class_name_dict, dict):
+                # ensure at least one class name remap exists
+                if not any([cls in class_name_dict for cls in self.class_set]):
+                    raise ValueError('Input mapping does not contain at least one known class name')
+
+                for sid in self.__classes:
+                    self.__classes[sid] = class_name_dict.get(self.__classes[sid], self.__classes[sid])
+            else:
+                raise ValueError('remap must be a dict with previous class names as keys and new class names as values!')
+        elif class_name_dict is None and feature_names is not None:
+            # validation happens in feature_names.setter
+            self.feature_names = feature_names
 
     @property
     def feature_names(self):
         "Returns the feature names as a numpy array of strings."
@@ -1114,6 +1131,7 @@ def __dir__():
                 'get_class',
                 'get_subset',
                 'random_subset',
+                'rename',
                 'get_feature_subset',
                 'keys',
                 'num_classes',