From 6df2c44559b4abbf85b1ca10f31613372ebc7a9a Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Sun, 5 Nov 2017 12:42:43 -0500
Subject: [PATCH 01/12] making arff description concise

---
 pyradigm/pyradigm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 111c48a..8be39ed 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1106,7 +1106,8 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
             raise NotImplementedError('encoding non-numeric features to numeric is not implemented yet! '
                                       'Encode features before exporting to ARFF.')
 
-        self.__description = 'ARFF relation {}\n read from {}'.format(arff_meta.name, arff_path)
+        # self.__description = 'ARFF relation {}\n read from {}'.format(arff_meta.name, arff_path)
+        self.__description = arff_meta.name  # to enable it as a label e.g. in neuropredict
 
         # initializing the key containers, before calling self.add_sample
         self.__data = OrderedDict()

From a304c600f7d775e178d3f01fffd2ba55cd5b3433 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 20:54:34 -0500
Subject: [PATCH 02/12] using only basic dict instead of OrderedDict; returned
 order may not reflect the original order of addition
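
A minimal usage sketch of the new flag (an illustrative note for this
series, not part of the patch itself; assumes `ds` is an existing
MLDataset):

    # with plain dicts, row order may differ between calls
    matrix, labels, ids = ds.data_and_labels()

    # sorted_ids=True guarantees the same (sorted) row order on every call
    m1, l1, ids1 = ds.data_and_labels(sorted_ids=True)
    m2, l2, ids2 = ds.data_and_labels(sorted_ids=True)
    assert list(ids1) == list(ids2)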
---
 pyradigm/pyradigm.py      | 75 +++++++++++++++++++++++++--------------
 pyradigm/test_pyradigm.py | 26 +++++++++++---
 2 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 8be39ed..53aec85 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -9,8 +9,8 @@
 import argparse
 import traceback
 import logging
-from os.path import join as pjoin, exists as pexists, realpath, basename, dirname, isfile
-from collections import Counter, OrderedDict, Sequence
+from os.path import exists as pexists, realpath, basename, dirname, isfile
+from collections import Counter, Sequence
 from itertools import islice
 
 import numpy as np
@@ -97,10 +97,9 @@ def __init__(self, filepath=None,
                 raise ValueError('Dataset to copy is empty.')
             self.__copy(in_dataset)
         elif data is None and labels is None and classes is None:
-            # TODO refactor the code to use only basic dict, as it allows for better equality comparisons
-            self.__data = OrderedDict()
-            self.__labels = OrderedDict()
-            self.__classes = OrderedDict()
+            self.__data = dict()
+            self.__labels = dict()
+            self.__classes = dict()
             self.__num_features = 0
             self.__dtype = None
             self.__description = ''
@@ -111,9 +110,9 @@ def __init__(self, filepath=None,
             self.__validate(data, labels, classes)
 
             # OrderedDict to ensure the order is maintained when data/labels are returned in a matrix/array form
-            self.__data = OrderedDict(data)
-            self.__labels = OrderedDict(labels)
-            self.__classes = OrderedDict(classes)
+            self.__data = dict(data)
+            self.__labels = dict(labels)
+            self.__classes = dict(classes)
             self.__description = description
 
             sample_ids = list(data)
@@ -135,11 +134,19 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self):
+    def data_and_labels(self, sorted_ids=False):
         """
         Dataset features and labels in a matrix form for learning.
 
-        Also returns sample_ids in the same order.
+        The row order of the data matrix and target labels matches that of
+        sample_ids, which is not guaranteed to be the same across calls.
+
+        Parameters
+        ----------
+
+        sorted_ids : bool
+            Flag to request data and sample ids in sorted order.
+            This guarantees the same order upon different calls.
 
         Returns
         -------
@@ -153,6 +160,9 @@ def data_and_labels(self):
         """
 
         sample_ids = np.array(self.keys)
+        if sorted_ids:
+            sample_ids.sort()
+
         label_dict = self.labels
         matrix = np.full([self.num_samples, self.num_features], np.nan)
         labels = np.full([self.num_samples, 1], np.nan)
@@ -332,7 +342,6 @@ def check_features(self, features):
 
         return features
 
-    # TODO try implementing based on pandas
     def add_sample(self, sample_id, features, label,
                    class_id=None,
                    overwrite=False,
@@ -481,7 +490,7 @@ def get_class(self, class_id):
 
         Parameters
         ----------
-        class_id : str
+        class_id : str or list
             identifier of the class to be returned.
 
         Returns
        -------
@@ -803,13 +812,13 @@
         class id to query.
 
         subset_ids = self.keys_with_value(self.classes, class_id)
         return subset_ids
 
-    def get_subset(self, subset_ids):
+    def get_subset(self, subset_ids_in):
         """
         Returns a smaller dataset identified by their keys/sample IDs.
 
         Parameters
         ----------
-        subset_ids : list
+        subset_ids_in : list
            List of sample IDs to extract from the dataset.
 
         Returns
        -------
        sub-dataset : MLDataset
            sub-dataset containing only requested sample IDs.
 
        """
 
-        num_existing_keys = sum([1 for key in subset_ids if key in self.__data])
+        subset_ids = [key for key in subset_ids_in if key in self.__data]
+        num_existing_keys = len(subset_ids)
 
         if subset_ids is not None and num_existing_keys > 0:
-            # need to ensure data are added to data, labels etc in the same order of sample IDs
-            # TODO come up with a way to do this even when not using OrderedDict()
-            # putting the access of data, labels and classes in the same loop would ensure there is correspondence
-            # across the three attributes of the class
+
             data = self.__get_subset_from_dict(self.__data, subset_ids)
             labels = self.__get_subset_from_dict(self.__labels, subset_ids)
             if self.__classes is not None:
@@ -872,8 +879,24 @@ def __iter__(self):
 
     @staticmethod
     def __get_subset_from_dict(input_dict, subset):
-        # Using OrderedDict helps ensure data are added to data, labels etc in the same order of sample IDs
-        return OrderedDict((sid, value) for sid, value in input_dict.items() if sid in subset)
+        """
+        Returns dictionary formed from the subset of keys.
+
+        Parameters
+        ----------
+        input_dict : dict
+
+        subset : iterable
+            List of keys to be extracted.
+
+        Returns
+        -------
+        dict
+
+        """
+
+        # the `if sid in input_dict` guard avoids a KeyError, although only keys already verified to exist are passed in
+        return {sid: input_dict[sid] for sid in subset if sid in input_dict}
 
     @property
     def keys(self):
@@ -1110,7 +1133,13 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
         self.__description = arff_meta.name  # to enable it as a label e.g. in neuropredict
 
         # initializing the key containers, before calling self.add_sample
-        self.__data = OrderedDict()
-        self.__labels = OrderedDict()
-        self.__classes = OrderedDict()
+        self.__data = dict()
+        self.__labels = dict()
+        self.__classes = dict()
 
         num_samples = len(arff_data)
         num_digits = len(str(num_samples))
-        make_id = lambda index: 'row{index:0{nd}d}'.format(index=index,nd=num_digits)
+        make_id = lambda num_index: 'row{id:0{nd}d}'.format(id=num_index,nd=num_digits)
         sample_classes = [cls.decode('utf-8') for cls in arff_data['class']]
         class_set = set(sample_classes)
         label_dict = dict()
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index b2c75b7..8f36a1b 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -1,5 +1,6 @@
 import os, sys
 import numpy as np
+import random
 from os.path import join as pjoin, exists as pexists, realpath, basename, dirname, isfile
 
 sys.dont_write_bytecode = True
@@ -31,7 +32,10 @@
 test_dataset = MLDataset()
 
 for class_index, class_id in enumerate(class_set):
-    for sub_ix in range(class_sizes[class_index]):
+    numeric_ids = list(range(class_sizes[class_index]))
+    # to ensure tests don't depend on the order of addition
+    random.shuffle(numeric_ids)
+    for sub_ix in numeric_ids:
         subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
         feat = np.random.random(num_features)
         test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
@@ -120,9 +124,23 @@ def test_invalid_constructor():
                         classes='invalid_value')
 
 def test_return_data_labels():
-    matrix, vec_labels, sub_ids = test_dataset.data_and_labels()
-    assert len(vec_labels)==len(sub_ids)
-    assert len(vec_labels)==matrix.shape[0]
+
+    matrix1, vec_labels1, sub_ids1 = test_dataset.data_and_labels()
+    assert len(vec_labels1)==len(sub_ids1)
+    assert len(vec_labels1)==matrix1.shape[0]
+
+
+def test_return_data_labels_sorted():
+    matrix1, vec_labels1, sub_ids1 = test_dataset.data_and_labels(sorted_ids=True)
+    assert len(vec_labels1)==len(sub_ids1)
+    assert len(vec_labels1)==matrix1.shape[0]
+
+    matrix2, vec_labels2, sub_ids2 = test_dataset.data_and_labels(sorted_ids=True)
+    assert np.all(vec_labels1==vec_labels2)
+    assert np.all(matrix1==matrix2)
+    assert np.all(sub_ids1==sub_ids2)
+    assert matrix1.shape == matrix2.shape
 
 def test_init_with_dict():
     new_ds = MLDataset(data=test_dataset.data, labels=test_dataset.labels, classes=test_dataset.classes)

From 89af9e17c49763b0371404fae1c3995767ace3d0 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 21:06:14 -0500
Subject: [PATCH 03/12] informing key when data differs between two datasets

---
 pyradigm/pyradigm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 53aec85..f3160ae 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1299,7 +1299,7 @@ def __eq__(self, other):
         elif id(self.__data) != id(other.data):
             for key in self.keys:
                 if not np.all(self.data[key] == other.data[key]):
-                    print('differing data for the sample ids.')
+                    print('differing data for at least one sample id: {}'.format(key))
                     return False
             return True
         else:

From bb7fb9ca87c5760c4099470ed4b44a139ccce7b7 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 22:23:18 -0500
Subject: [PATCH 04/12] tighter management of data type

---
 pyradigm/pyradigm.py      | 19 +++++++++++--------
 pyradigm/test_pyradigm.py |  6 +++++-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index f3160ae..1aae75b 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -109,7 +109,6 @@ def __init__(self, filepath=None,
             # but only in data, labels and classes, not feature names
             self.__validate(data, labels, classes)
 
-            # OrderedDict to ensure the order is maintained when data/labels are returned in a matrix/array form
             self.__data = dict(data)
             self.__labels = dict(labels)
             self.__classes = dict(classes)
@@ -118,7 +117,7 @@ def __init__(self, filepath=None,
             sample_ids = list(data)
             features0 = data[sample_ids[0]]
             self.__num_features = features0.size if isinstance(features0,np.ndarray) else len(features0)
-            self.__dtype = type(data[sample_ids[0]])
+            self.__dtype = np.array(data[sample_ids[0]]).dtype
 
             # assigning default names for each feature
             if feature_names is None:
@@ -164,7 +163,7 @@ def data_and_labels(self, sorted_ids=False):
             sample_ids.sort()
 
         label_dict = self.labels
-        matrix = np.full([self.num_samples, self.num_features], np.nan)
+        matrix = np.empty([self.num_samples, self.num_features], dtype=self.dtype)
         labels = np.full([self.num_samples, 1], np.nan)
         for ix, sample in enumerate(sample_ids):
             matrix[ix, :] = self.__data[sample]
@@ -245,7 +244,7 @@ def feature_names(self):
 
     @feature_names.setter
     def feature_names(self, names):
-        "Stores the text labels for features"
+        """Stores the text labels for features"""
 
         if len(names) != self.num_features:
             raise ValueError("Number of names does not match the number of features!")
@@ -334,6 +333,10 @@ def check_features(self, features):
         if not isinstance(features, np.ndarray):
             features = np.asarray(features)
 
+        # # avoid numeric restrictions to enable pyradigm's utility for a broader range of applications.
+        # if not np.issubdtype(features.dtype, np.number):
+        #     raise TypeError('non-numeric features are provided, which are not supported!')
+
         if features.size <= 0:
             raise ValueError('provided features are empty.')
 
@@ -394,7 +397,7 @@ def add_sample(self, sample_id, features, label,
             self.__data[sample_id] = features
             self.__labels[sample_id] = label
             self.__classes[sample_id] = class_id
-            self.__dtype = type(features)
+            self.__dtype = features.dtype
             self.__num_features = features.size if isinstance(features, np.ndarray) else len(features)
             if feature_names is None:
                 self.__feature_names = self.__str_names(self.num_features)
@@ -402,7 +405,7 @@ def add_sample(self, sample_id, features, label,
             if self.__num_features != features.size:
                 raise ValueError('dimensionality of this sample ({}) does not match existing samples ({})'.format(
                     features.size, self.__num_features))
-            if not isinstance(features, self.__dtype):
+            if features.dtype != self.__dtype:
                 raise TypeError("Mismatched dtype. Provide {}".format(self.__dtype))
 
             self.__data[sample_id] = features
@@ -939,8 +942,8 @@ def dtype(self):
     @dtype.setter
     def dtype(self, type_val):
         if self.__dtype is None:
-            if not isinstance(type_val, type):
-                raise TypeError('Invalid data type.')
+            if not isinstance(type_val, np.dtype):
+                raise TypeError('Invalid data type. It must be a valid numpy dtype!')
             self.__dtype = type_val
         else:
             warnings.warn('Data type is already inferred. Cannot be set!')
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 8f36a1b..60dbd51 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -26,6 +26,7 @@
 num_classes = np.random.randint( 2, 50)
 class_sizes = np.random.randint(10, 1000, num_classes)
 num_features = np.random.randint(10, 500)
+data_type = 'float32'
 
 class_set = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
 feat_names = np.array([ str(x) for x in range(num_features) ])
@@ -37,7 +38,7 @@
     random.shuffle(numeric_ids)
     for sub_ix in numeric_ids:
         subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
-        feat = np.random.random(num_features)
+        feat = np.random.random(num_features).astype(data_type)
         test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
 
 out_file = os.path.join(out_dir,'random_example_dataset.pkl')
@@ -82,6 +83,9 @@ def test_num_classes():
 def test_num_features():
     assert test_dataset.num_features == num_features
 
+def test_dtype():
+    assert np.issubdtype(test_dataset.dtype, data_type)
+
 def test_num_features_setter():
     with raises(AttributeError):
         test_dataset.num_features = 0

From ea0a0e9ffd9fbadf5141331bf0768fbe177a43f3 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 14 Nov 2017 22:35:26 -0500
Subject: [PATCH 05/12] options to cast features to a different data type
 (e.g. to change precision to save memory/disk space)
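
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; assumes `ds` is an existing MLDataset with
float64 features):

    import numpy as np

    # cast features to half precision to reduce the memory footprint;
    # out_data_type must be a numpy dtype, else the dataset's own dtype is used
    matrix, labels, ids = ds.data_and_labels(out_data_type=np.dtype('float16'))
    assert matrix.dtype == np.float16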
---
 pyradigm/pyradigm.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 1aae75b..49677f3 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -133,7 +133,7 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self, sorted_ids=False):
+    def data_and_labels(self, out_data_type=None, sorted_ids=False):
         """
         Dataset features and labels in a matrix form for learning.
 
@@ -142,6 +142,8 @@
         Parameters
         ----------
 
+        out_data_type : numpy.dtype
+            Valid numpy dtype to cast the features to.
 
         sorted_ids : bool
             Flag to request data and sample ids in sorted order.
@@ -150,7 +152,7 @@
         Returns
         -------
         data_matrix : ndarray
-            2D array of shape [num_samples, num_features] with features  corresponding row-wise to sample_ids
+            2D array of shape [num_samples, num_features] with features corresponding row-wise to sample_ids
         labels : ndarray
             Array of numeric labels for each sample corresponding row-wise to sample_ids
         sample_ids : list
             List of sample ids in the same order as features and labels.
@@ -161,14 +163,17 @@
         """
 
         sample_ids = np.array(self.keys)
         if sorted_ids:
             sample_ids.sort()
 
+        if out_data_type is None or not isinstance(out_data_type, np.dtype):
+            out_data_type = self.dtype
+
         label_dict = self.labels
-        matrix = np.empty([self.num_samples, self.num_features], dtype=self.dtype)
+        matrix = np.empty([self.num_samples, self.num_features], dtype=out_data_type)
         labels = np.full([self.num_samples, 1], np.nan)
         for ix, sample in enumerate(sample_ids):
-            matrix[ix, :] = self.__data[sample]
+            matrix[ix, :] = self.__data[sample].astype(out_data_type)
             labels[ix] = label_dict[sample]
 
         return matrix, np.ravel(labels), sample_ids

From b9fd0978c55fd5cf94c687bd972d53610ce765a8 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Tue, 21 Nov 2017 16:58:06 -0500
Subject: [PATCH 06/12] new flag to display extended info about the dataset

---
 pyradigm/pyradigm.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 49677f3..dae13be 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -1329,14 +1329,14 @@ def cli_run():
 
     """
 
-    path_list, meta_requested, summary_requested, add_path_list, out_path = parse_args()
+    path_list, meta_requested, summary_requested, flag_extended, add_path_list, out_path = parse_args()
 
     # printing info if requested
     if path_list:
         for ds_path in path_list:
             ds = MLDataset(ds_path)
             if summary_requested:
-                print_info(ds, ds_path)
+                print_info(ds, ds_path, extended=flag_extended)
             if meta_requested:
                 print_meta(ds, ds_path)
 
@@ -1347,7 +1347,7 @@
     return
 
-def print_info(ds, ds_path=None):
+def print_info(ds, ds_path=None, extended=False):
     "Prints basic summary of a given dataset."
 
     if ds_path is None:
         bname = ''
     else:
         bname = basename(ds_path)
 
     dashes = '-' * len(bname)
     print(bname)
-    print(dashes)
     print(ds)
+    if extended:
+        print('\nfeature names :\n{}'.format(ds.feature_names))
     print(dashes)
 
     return
@@ -1412,6 +1413,9 @@ def get_parser():
     parser.add_argument('-i', '--info', action='store_true', dest='summary_requested', required=False,
                         default=False, help='Prints summary info (classes, #samples, #features).')
 
+    parser.add_argument('-e', '--extended', action='store_true', dest='extended_summary_requested', required=False,
+                        default=False, help='Prints extended summary info (adding feature names etc).')
+
     arithmetic_group = parser.add_argument_group('Options for multiple datasets')
     arithmetic_group.add_argument('-a', '--add', nargs='+', action='store', dest='add_path_list', required=False,
                                   default=None, help='List of MLDatasets to combine into a larger dataset.')
@@ -1469,11 +1473,18 @@ def parse_args():
         if len(add_path_list) < 2:
             raise ValueError('Need a minimum of two datasets to combine!')
 
+    extended_summary_requested = params.extended_summary_requested
+    summary_requested = params.summary_requested
+    if extended_summary_requested:
+        summary_requested = True
+
     # removing duplicates (from regex etc)
     path_list = set(path_list)
     add_path_list = set(add_path_list)
 
-    return path_list, params.meta_requested, params.summary_requested, add_path_list, out_path
+    return path_list, params.meta_requested, \
+           summary_requested, extended_summary_requested, \
+           add_path_list, out_path
 
 if __name__ == '__main__':

From e0622dffd75fbc041fc6d68ef855d5088d22447f Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 23 Nov 2017 23:06:23 -0500
Subject: [PATCH 07/12] fully randomizing the insertion order of samples and
 classes

---
 pyradigm/test_pyradigm.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 60dbd51..b6a0602 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -28,18 +28,34 @@
 num_features = np.random.randint(10, 500)
 data_type = 'float32'
 
-class_set = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
+class_set = [ 'C{}'.format(x) for x in range(num_classes) ]
 feat_names = np.array([ str(x) for x in range(num_features) ])
 
+sample_ids = list()
+class_ids = list()
+num_labels = list()
+for class_index, cls_id in list(enumerate(class_set)):
+    ids_this_class = list([ '{}_S{}'.format(cls_id, sub_ix) for sub_ix in list(range(class_sizes[class_index]))])
+    sample_ids.extend(ids_this_class)
+    class_ids.extend([cls_id] * class_sizes[class_index])
+    num_labels.extend([class_index]*class_sizes[class_index])
+
+sample_ids = np.array(sample_ids)
+class_ids = np.array(class_ids)
+num_labels = np.array(num_labels)
+
+# to ensure tests don't depend on the order of sample/class addition
+shuffle_order = list(range(len(sample_ids)))
+random.shuffle(shuffle_order)
+
+sample_ids = sample_ids[shuffle_order]
+class_ids = class_ids[shuffle_order]
+num_labels = num_labels[shuffle_order]
+
 test_dataset = MLDataset()
-for class_index, class_id in enumerate(class_set):
-    numeric_ids = list(range(class_sizes[class_index]))
-    # to ensure tests don't depend on the order of addition
-    random.shuffle(numeric_ids)
-    for sub_ix in numeric_ids:
-        subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
-        feat = np.random.random(num_features).astype(data_type)
-        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)
+for ix, sid in enumerate(sample_ids):
+    feat = np.random.random(num_features).astype(data_type)
+    test_dataset.add_sample(sid, feat, num_labels[ix], class_ids[ix], feat_names)
 
 out_file = os.path.join(out_dir,'random_example_dataset.pkl')
 test_dataset.save(out_file)

From 6505d1da96bf1db15491202bad5b208f3117812c Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 23 Nov 2017 23:06:47 -0500
Subject: [PATCH 08/12] new option to return data matrix grouped by class
 membership
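
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; assumes `ds` is an existing MLDataset):

    # rows are returned class by class, e.g. to visualize class-wise patterns
    matrix, labels, ids = ds.data_and_labels(group_by_class=True)

    # within each class, rows can additionally be sorted by sample id
    matrix, labels, ids = ds.data_and_labels(group_by_class=True, sorted_ids=True)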
---
 pyradigm/pyradigm.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index dae13be..c44d925 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -133,7 +133,10 @@ def data(self):
         """data in its original dict form."""
         return self.__data
 
-    def data_and_labels(self, out_data_type=None, sorted_ids=False):
+    def data_and_labels(self,
+                        out_data_type=None,
+                        sorted_ids=False,
+                        group_by_class=False):
         """
         Dataset features and labels in a matrix form for learning.
 
@@ -149,6 +152,10 @@
             Flag to request data and sample ids in sorted order.
             This guarantees the same order upon different calls.
 
+        group_by_class : bool
+            Flag to help group the rows in the output matrix by class.
+            This helps to quickly visualize the patterns in data by class.
+
         Returns
         -------
         data_matrix : ndarray
@@ -160,9 +167,17 @@
 
         """
 
-        sample_ids = np.array(self.keys)
-        if sorted_ids:
-            sample_ids.sort()
+        if group_by_class:
+            sample_ids = list()
+            for cls in self.class_set:
+                ids_in_class = np.array(self.sample_ids_in_class(cls))
+                if sorted_ids:
+                    ids_in_class.sort()
+                sample_ids.extend(ids_in_class)
+        else:
+            sample_ids = np.array(self.keys)
+            if sorted_ids:
+                sample_ids.sort()

From 9f664f25f51c3ae236ac9f8c62c1795ad99dcbaf Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 24 Nov 2017 18:46:45 -0500
Subject: [PATCH 09/12] new constructor for arff, which makes it cleaner:
 MLDataset.arff(path)
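
A sketch of the new constructor, mirroring the test change below (the ARFF
path and import line are illustrative, not part of the patch itself):

    from pyradigm import MLDataset

    # previously: MLDataset(arff_path='/path/to/iris.arff')
    ds = MLDataset.arff('/path/to/iris.arff')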
---
 pyradigm/pyradigm.py      | 13 +++++++++++++
 pyradigm/test_pyradigm.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index c44d925..356f223 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -128,6 +128,18 @@ def __init__(self, filepath=None,
         else:
             raise ValueError('Incorrect way to construct the dataset.')
 
+    @classmethod
+    def arff(cls, arff_path, encode_nonnumeric=False):
+        "Constructor from ARFF file"
+
+        ds = cls.__new__(cls)
+        if isfile(arff_path):
+            ds = ds.__load_arff(arff_path, encode_nonnumeric)
+        else:
+            raise IOError('Given ARFF cannot be found!')
+
+        return ds
+
     @property
     def data(self):
         """data in its original dict form."""
@@ -1184,6 +1196,7 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
 
         self.__feature_names = attr_names
 
         return
+        return self
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index b6a0602..cc5af7d 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -331,7 +331,7 @@ def test_train_test_split_ids_perc():
 
 def test_load_arff():
     arff_path = realpath(pjoin(dirname(__file__),'../example_datasets/iris.arff'))
-    mld = MLDataset(arff_path=arff_path)
+    mld = MLDataset.arff(arff_path)
 
     if mld.num_samples != 150:
         raise ValueError('number of samples mismatch')

From 5a19da31b828c7f3a604f565a3c5577fcd68f7ac Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 24 Nov 2017 18:47:23 -0500
Subject: [PATCH 10/12] new copy constructor, which makes it cleaner:
 MLDataset.copy(ds)
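
A sketch of the new constructor, mirroring the test change below (an
illustrative note, not part of the patch itself; assumes `ds` is an
existing MLDataset):

    # equivalent to MLDataset(in_dataset=ds), but reads more clearly
    ds_copy = MLDataset.copy(ds)
    assert ds_copy == ds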
---
 pyradigm/pyradigm.py      | 15 ++++++++++++++-
 pyradigm/test_pyradigm.py |  5 +++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index 356f223..d16e6a7 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -140,6 +140,20 @@ def arff(cls, arff_path, encode_nonnumeric=False):
 
         return ds
 
+    @classmethod
+    def copy(cls, in_dataset):
+        "Copy constructor"
+
+        if not isinstance(in_dataset, MLDataset):
+            raise ValueError('Invalid class input: MLDataset expected!')
+        if in_dataset.num_samples <= 0:
+            raise ValueError('Dataset to copy is empty.')
+
+        ds = cls.__new__(cls)
+        ds = ds.__copy(in_dataset)
+
+        return ds
+
     @property
     def data(self):
         """data in its original dict form."""
@@ -1195,7 +1209,6 @@ def __load_arff(self, arff_path, encode_nonnumeric=False):
 
         self.__feature_names = attr_names
 
-        return
         return self
 
     def save(self, file_path):
diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index cc5af7d..16096d2 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -242,6 +242,9 @@ def test_eq_copy():
     new_copy = MLDataset(in_dataset=copy_dataset)
     assert new_copy == copy_dataset
 
+    new_copy2 = MLDataset.copy(copy_dataset)
+    assert new_copy2 == copy_dataset
+
 def test_unpickling():
     out_file = os.path.join(out_dir, 'random_pickled_dataset.pkl')
     copy_dataset.save(out_file)
@@ -346,5 +349,3 @@ def test_load_arff():
         raise ValueError('length of feature names do not match number of features')
 
     # print(mld)
-
-test_load_arff()
\ No newline at end of file

From d35714dd15dd6e81aa716954d2d4cdd7e6b4c755 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Thu, 1 Mar 2018 21:46:21 -0500
Subject: [PATCH 11/12] more tests to ensure splits over repetitions are
 sufficiently random

---
 pyradigm/test_pyradigm.py | 79 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/pyradigm/test_pyradigm.py b/pyradigm/test_pyradigm.py
index 16096d2..47efa70 100644
--- a/pyradigm/test_pyradigm.py
+++ b/pyradigm/test_pyradigm.py
@@ -72,6 +72,54 @@
 
 copy_dataset = MLDataset(in_dataset=test_dataset)
 
+# ------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------
+
+feat_generator = np.random.randn
+
+def make_random_MLdataset(max_num_classes = 20,
+                          max_class_size = 50,
+                          max_dim = 100,
+                          stratified = True):
+    "Generates a random MLDataset for use in testing."
+
+    smallest = 10
+    max_class_size = max(smallest, max_class_size)
+    largest = max(50, max_class_size)
+    largest = max(smallest+3,largest)
+
+    num_classes = np.random.randint(2, max_num_classes, 1)
+    if isinstance(num_classes, np.ndarray):
+        num_classes = num_classes[0]
+    if not stratified:
+        class_sizes = np.random.randint(smallest, largest+1,
+                                        size=[num_classes, 1])
+    else:
+        class_sizes = np.repeat(np.random.randint(smallest, largest),
+                                num_classes)
+
+    num_features = np.random.randint(min(3, max_dim), max(3, max_dim), 1)[0]
+    feat_names = [ str(x) for x in range(num_features)]
+
+    class_ids = list()
+    labels = list()
+    for cl in range(num_classes):
+        class_ids.append('class-{}'.format(cl))
+        labels.append(int(cl))
+
+    ds = MLDataset()
+    for cc, class_ in enumerate(class_ids):
+        subids = [ 'sub{:03}-class{:03}'.format(ix,cc) for ix in range(class_sizes[cc]) ]
+        for sid in subids:
+            ds.add_sample(sid, feat_generator(num_features), int(cc), class_, feat_names)
+
+    return ds
+
+
+# ------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------
+
+
 rand_index = np.random.randint(0,len(class_set),1)[0]
 random_class_name = class_set[rand_index]
 random_class_ds = test_dataset.get_class(random_class_name)
@@ -328,6 +376,37 @@ def test_train_test_split_ids_perc():
     with raises(ValueError):
         copy_dataset.train_test_split_ids(train_perc=-1)
 
+
+def test_train_test_split_is_sufficiently_random():
+    """Test to ensure ids in repeated splits are sufficiently random"""
+
+    rand_ds = make_random_MLdataset(max_num_classes=10, max_class_size=1000, max_dim=1)
+
+    total_num_rep = 1000
+    for perc in np.arange(0.25, 1.0, 0.2):
+        accum_train = list()
+        accum_test = list()
+        for rep in range(total_num_rep):
+            cur_train, cur_test = rand_ds.train_test_split_ids(train_perc=perc)
+            accum_train.extend(cur_train)
+            accum_test.extend(cur_test)
+
+        ids_train, counts_train = np.unique(accum_train, return_counts=True)
+        ids_test, counts_test = np.unique(accum_test, return_counts=True)
+
+        # if the splits were truly [sufficiently] random,
+        # the counts for different ids must be similar
+        # and close to the expected values below:
+        expected_count_train = total_num_rep * perc
+        expected_count_test = total_num_rep * (1.0 - perc)
+        within_tol = lambda count, expd: np.isclose(np.mean(count), expd, rtol=0.05)
+
+        if not within_tol(counts_train, expected_count_train) or \
+                not within_tol(counts_test, expected_count_test):
+            raise ValueError('train/test splits ({}%) are NOT sufficiently random '
+                             'over {} repetitions'.format(100*perc, total_num_rep))
+
 # ------------------------------------------------
 # different file formats
 # ------------------------------------------------

From bbe06a442f14816034abb5f8a3f53af9dd3a8dab Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana
Date: Fri, 2 Mar 2018 11:43:13 -0500
Subject: [PATCH 12/12] new method to enable renaming of classes and features
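
A sketch of the intended use (an illustrative note for this series, not
part of the patch itself; the class and feature names are hypothetical,
and the feature list must match num_features):

    # remap one or more class names; classes missing from the mapping keep theirs
    ds.rename(class_name_dict={'class-0': 'healthy', 'class-1': 'disease'})

    # rename features (length is validated by the feature_names setter)
    ds.rename(feature_names=['thickness', 'area', 'volume'])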
---
 pyradigm/pyradigm.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pyradigm/pyradigm.py b/pyradigm/pyradigm.py
index d16e6a7..6dc8fe8 100644
--- a/pyradigm/pyradigm.py
+++ b/pyradigm/pyradigm.py
@@ -284,6 +284,23 @@ def classes(self, values):
         else:
             raise ValueError('classes input must be a dictionary!')
 
+    def rename(self, class_name_dict=None, feature_names=None):
+        """Method to remap class names, and rename features"""
+
+        if class_name_dict is not None and feature_names is None:
+            if isinstance(class_name_dict, dict):
+                # ensure at least one class name remap exists
+                if not any([cls in class_name_dict for cls in self.class_set]):
+                    raise ValueError('Input mapping does not contain at least one known class name')
+
+                for sid in self.__classes:
+                    self.__classes[sid] = class_name_dict.get(self.__classes[sid], self.__classes[sid])
+            else:
+                raise ValueError('remap must be a dict with previous class names as keys and new class names as values!')
+        elif class_name_dict is None and feature_names is not None:
+            # validation happens in feature_names.setter
+            self.feature_names = feature_names
 
     @property
     def feature_names(self):
         "Returns the feature names as a numpy array of strings."
@@ -1114,6 +1131,7 @@ def __dir__():
                 'get_class',
                 'get_subset',
                 'random_subset',
+                'rename',
                 'get_feature_subset',
                 'keys',
                 'num_classes',