Parametrize report of sum of matrices. Closes #29. Related #35

ayrna · Jan 31, 2018 · c59baba · c59baba
1 parent fe91667
commit c59baba
Show file tree

Hide file tree

Showing 21 changed files with 229 additions and 259 deletions.
diff --git a/doc/orca-tutorial.md b/doc/orca-tutorial.md
@@ -228,6 +228,7 @@ ORCA uses the `Experiments` folder to store all the results of the different exp
         - Optimal hyper-parameters values obtained after nested cross-validation ('OptHyperparams').
         - Computational time results ('Times').
 
+If you set the option `report_sum = true` in `{general-conf}`, the same metrics will additionally be calculated with a matrix that is the sum of the generalization matrices (as Weka does). **Note that this only makes sense for a k-fold experimental design**. With this option active, two additional reports are generated: `mean-results_matrices_sum_train.csv` and `mean-results_matrices_sum_test.csv`.
 
 ## Running algorithms with ORCA API
 

diff --git a/src/Config.m b/src/Config.m
@@ -80,22 +80,6 @@
 
                 obj.exps{i}=eObj;
             end
-
-
-            %             % TODO
-            %             if Config.validateconfig(exps)
-            %                 return
-            %             else
-            %                 error('Invalid INI file %s', confFile)
-            %             end
-        end
-    end
-
-    methods(Static = true, Access = private)
-
-        function valid = validateconfig(exps)
-            %VALIDATECONFIG Validates set of experiments
-            %TODO         
         end
     end
 end

diff --git a/src/Experiment.m b/src/Experiment.m
@@ -28,6 +28,9 @@
         cvCriteria = MAE;
         crossvalide = 0;
         resultsDir = '';
+        % calculate metrics with the sum of matrices (only suitable for 
+        % k-fold experimental design)
+        report_sum = 0; 
         seed = 1;
         parameters; % parameters to optimize
     end
@@ -81,7 +84,6 @@
             % Copy ini values to corresponding object properties
 
             % General experiment properties
-            % TODO: check robustness and document behaviour of ini file
             if expObj.general.isKey('num_folds')
                 obj.data.nOfFolds = str2num(expObj.general('num_folds'));
             end
@@ -95,6 +97,9 @@
             if expObj.general.isKey('seed')
                 obj.seed = str2num(expObj.general('seed'));
             end
+            if expObj.general.isKey('report_sum')
+                obj.report_sum = str2num(expObj.general('report_sum'));
+            end
 
             try
                 obj.data.directory = expObj.general('directory');

diff --git a/src/Utilities.m b/src/Utilities.m
@@ -96,9 +96,10 @@
 
             disp('Calculating results...');
             % Train results (note last argument)
-            Utilities.results([logsDir '/' 'Results'],1);
+
+            Utilities.results([logsDir '/' 'Results'],'report_sum', myExperiment.report_sum, 'train', true);
             % Test results
-            Utilities.results([logsDir '/' 'Results']);
+            Utilities.results([logsDir '/' 'Results'], 'report_sum', myExperiment.report_sum);
             %rmpath('Measures');
             %rmpath('Algorithms');
 
@@ -116,7 +117,7 @@ function octaveParallelAuxFunction(experimentToRun,logsDir)
             end
         end
 
-        function results(experiment_folder,train)
+        function results(experiment_folder,varargin)
             % RESULTS Function for computing the results
             %   RESULTS(EXPERIMENT_FOLDER) computes results of predictions
             %   stored in EXPERIMENT_FOLDER. It generates CSV files with
@@ -131,7 +132,7 @@ function results(experiment_folder,train)
             %       confusion matrices of the _k_ experiments (as Weka does). Each column
             %       presents the performance of this single matrix.
             %
-            %   RESULTS(EXPERIMENT_FOLDER,TRAIN) same as
+            %   RESULTS(EXPERIMENT_FOLDER,'TRAIN', true) same as
             %   RESULTS(EXPERIMENT_FOLDER) but calculates performance in train
             %   data. It can be useful to evaluate overfitting.
             %
@@ -142,24 +143,21 @@ function results(experiment_folder,train)
             addpath(fullfile(fileparts(which('Utilities.m')),'Measures'));
             addpath(fullfile(fileparts(which('Utilities.m')),'Algorithms'));
 
-            if nargin < 2
-                train = 0;
-            elseif nargin == 1
-                train = train;
-            end
-            experiments = dir(experiment_folder);
-
-            %idx=strfind(experiment_folder,'Results');
-            %scriptpath = [experiment_folder(1:idx-1)];
+            opt.train = false;
+            opt.report_sum = false;
 
+            opt = Utilities.parseVarArgs(opt, varargin);
+
+            experiments = dir(experiment_folder);
+
             for i=1:numel(experiments)
                 if ~(any(strcmp(experiments(i).name, {'.', '..'}))) && experiments(i).isdir
                     disp([experiment_folder '/' experiments(i).name '/' 'dataset'])
                     fid = fopen([experiment_folder '/' experiments(i).name '/' 'dataset'],'r');
                     datasetPath = fgetl(fid);
                     fclose(fid);
 
-                    if train == 1
+                    if opt.train
                         predicted_files = dir([experiment_folder '/' experiments(i).name '/' 'Predictions' '/' 'train_*']);
                     else
                         predicted_files = dir([experiment_folder '/' experiments(i).name '/' 'Predictions' '/' 'test_*']);
@@ -175,27 +173,19 @@ function results(experiment_folder,train)
                     time_files = dir([experiment_folder '/' experiments(i).name '/' 'Times' '/' '*.*']);
                     hyp_files = dir([experiment_folder '/' experiments(i).name '/' 'OptHyperparams' '/' '*.*']);
 
-                    if train == 1
+                    if opt.train
                         guess_files = dir([experiment_folder '/' experiments(i).name '/' 'Guess' '/' 'train_*']);
                     else
                         guess_files = dir([experiment_folder '/' experiments(i).name '/' 'Guess' '/' 'test_*']);
                     end
 
-                    %str=predicted_files(1).name;
-                    %[matchstart,matchend] = regexp( str,'_(.+)\.\d+');
-                    %dataset=str(matchstart+1:matchend-2);
-
-                    %auxscript =  experimentos(i).name;
-                    %[matchstart,matchend]=regexp(auxscript,dataset);
-                    %basescript = ['exp-' auxscript(matchend+2:end) '-' dataset '-'];
-
                     % Discard "." and ".."
                     if ~(exist ('OCTAVE_VERSION', 'builtin') > 0)
                         time_files = time_files(3:numel(time_files));
                         hyp_files = hyp_files(3:numel(hyp_files));
                     end
 
-                    if train == 1
+                    if opt.train
                         real_files = dir([datasetPath '/' 'train_*']);
                     else
                         real_files = dir([datasetPath '/' 'test_*']);
@@ -263,7 +253,7 @@ function results(experiment_folder,train)
                     results_matrix = results_matrix';
 
                     % Results for the independent dataset
-                    if train == 1
+                    if opt.train
                         fid = fopen([experiment_folder '/' experiments(i).name '/' 'results_train.csv'],'w');
                     else
                         fid = fopen([experiment_folder '/' experiments(i).name '/' 'results_test.csv'],'w');
@@ -284,9 +274,8 @@ function results(experiment_folder,train)
                     fclose(fid);
 
                     % Confusion matrices and sum of confusion matrices
-                    % TODO PARAMETRIZAR
-                    if false
-                         if train == 1
+                    if opt.report_sum
+                         if opt.train
                              fid = fopen([experiment_folder '/' experiments(i).name '/' 'matrices_train.txt'],'w');
                          else
                              fid = fopen([experiment_folder '/' experiments(i).name '/' 'matrices_test.txt'],'w');
@@ -326,7 +315,7 @@ function results(experiment_folder,train)
                     means = mean(results_matrix,1);
                     stdev = std(results_matrix,0,1);
 
-                    if train == 1
+                    if opt.train
                         if ~exist([experiment_folder '/' 'mean-results_train.csv'],'file')
                             add_head = 1;
                         else
@@ -363,9 +352,8 @@ function results(experiment_folder,train)
 
 
                     % Confusion matrices and sum of confusion matrices
-                    % TODO PARAMETRIZAR
-                    if false
-                         if train == 1
+                    if opt.report_sum
+                         if opt.train
                              fid = fopen([experiment_folder '/' 'mean-results_matrices_sum_train.csv'],'at');
                          else
                              fid = fopen([experiment_folder '/' 'mean-results_matrices_sum_test.csv'],'at');
@@ -503,22 +491,6 @@ function runExperimentFold(confFile)
             dbs(1) = [];
             validDataSets = 1;
 
-            % Currently, 'all' is not working
-            %if strcmpi(dataSetNames{1}, 'all')
-            %    trainFileNames = cell(size(dbs,1),1);
-            %    testFileNames = cell(size(dbs,1),1);
-            %    for dd=1:size(dbs,1)
-            %        % get directory
-            %        if dbs(dd).isdir,
-            %            ejemplo = [directory '/' dbs(dd).name '/' 'matlab' '/' 'train_' dbs(dd).name '.*'];
-            %            trainFileNames{validDataSets} = dir(ejemplo);
-            %            ejemplo = [directory '/' dbs(dd).name '/' 'matlab' '/' 'test_' dbs(dd).name '.*'];
-            %            testFileNames{validDataSets} = dir(ejemplo);
-            %            validDataSets = validDataSets + 1;
-            %        end
-            %
-            %    end
-            %else
             trainFileNames = cell(numel(dataSetNames),1);
             testFileNames = cell(numel(dataSetNames),1);
             for j=1:numel(dataSetNames)
@@ -531,7 +503,6 @@ function runExperimentFold(confFile)
                     validDataSets = validDataSets + 1;
                 end
             end
-            %end
         end
 
         function checkDatasets(basedir, datasets)
@@ -647,39 +618,48 @@ function closePool()
             % - 'closepool': whether to close or not the pool after 
             %    experiments. Default 'true'
             % Solution adapted from https://stackoverflow.com/questions/2775263/how-to-deal-with-name-value-pairs-of-function-arguments-in-matlab#2776238
+
             if (exist ('OCTAVE_VERSION', 'builtin') > 0)
                 maximum_ncores = nproc;
             else
                 maximum_ncores = feature('numCores');
             end
 
-            varargin = varargin{1};
-
             options = struct('parallel',false,'numcores',maximum_ncores,'closepool',true);
 
-            %# read the acceptable names
-            optionNames = fieldnames(options);
-
-            %# count arguments
-            nArgs = length(varargin);
-            if mod(nArgs,2)
-                error('parseParArgs needs propertyName/propertyValue pairs')
-            end
-
-            for pair = reshape(varargin,2,[]) %# pair is {propName;propValue}
-                inpName = lower(pair{1}); %# make case insensitive
-
-                if any(strcmp(inpName,optionNames))
-                    %# overwrite options.
-                    options.(inpName) = pair{2};
-                else
-                    error('%s is not a recognized parameter name',inpName)
+            varargin = varargin{:};
+            if ~isempty(varargin)
+                options = Utilities.parseVarArgs(options, varargin);
+                if options.parallel && options.numcores <2
+                    disp('Number of cores to low, setting to default number of cores')
+                    options.numcores = maximum_ncores;
                 end
             end
-
-            if options.parallel && options.numcores <2
-                disp('Number of cores to low, setting to default number of cores')
-                options.numcores = maximum_ncores;
+        end
+
+        function options = parseVarArgs(options, varargin)
+            if ~isempty(varargin{:})
+                par = varargin{:};
+
+                % read the acceptable names
+                optionNames = fieldnames(options);
+
+                % count arguments
+                nArgs = length(par);
+                if mod(nArgs,2)
+                    error('parseVarArgs needs propertyName/propertyValue pairs')
+                end
+
+                for pair = reshape(par,2,[]) % pair is {propName;propValue}
+                    inpName = lower(pair{1}); % make case insensitive
+
+                    if any(strcmp(inpName,optionNames))
+                        % overwrite options.
+                        options.(inpName) = pair{2};
+                    else
+                        error('%s is not a recognized parameter name',inpName)
+                    end
+                end
             end
         end
     end

diff --git a/src/config-files/cssvc.ini b/src/config-files/cssvc.ini
@@ -8,9 +8,9 @@
 {general-conf}
 seed = 1
 ; Datasets path
-basedir = ../../../datasets/ordinal/real/30-holdout
+basedir = ../../../datasets/ordinal-regression
 ; Datasets to process (comma separated list or all to process all)
-datasets = automobile,balance-scale,bondrate,car,contact-lenses,ERA,ESL,eucalyptus,LEV,marketing,newthyroid,pasture,squash-stored,squash-unstored,SWD,tae,thyroid,toy,winequality-red,winequality-white
+datasets = all
 ; Activate data standardization
 standarize = true
 ; Number of folds for the parameters optimization
@@ -32,8 +32,8 @@ k = 10.^(-3:1:3)
 [cssvc-mae-regression5]
 {general-conf}
 seed = 1
-basedir = ../../../datasets/ordinal/regression/5bins
-datasets = abalone,housing,machine,pyrim,stock,bank1-5,bank2-5,calhousing-5,census1-5,census2-5,computer1-5,computer2-5
+basedir = ../../../datasets/discretized-regression/5bins
+datasets = all
 standarize = true
 num_folds = 5
 cvmetric = mae
@@ -50,8 +50,8 @@ k = 10.^(-3:1:3)
 [cssvc-mae-regression10]
 {general-conf}
 seed = 1
-basedir = ../../../datasets/ordinal/regression/10bins
-datasets = abalone10,housing10,machine10,pyrim10,stock10bank1-10,bank2-10,calhousing-10,census1-10,census2-10,computer1-10,computer2-10
+basedir = ../../../datasets/discretized-regression/10bins
+datasets = all
 standarize = true
 num_folds = 5
 cvmetric = mae
@@ -67,8 +67,8 @@ k = 10.^(-3:1:3)
 [cssvc-mze-real]
 {general-conf}
 seed = 1
-basedir = ../../../datasets/ordinal/real/30-holdout
-datasets = automobile,balance-scale,bondrate,car,contact-lenses,ERA,ESL,eucalyptus,LEV,marketing,newthyroid,pasture,squash-stored,squash-unstored,SWD,tae,thyroid,toy,winequality-red,winequality-white
+basedir = ../../../datasets/ordinal-regression
+datasets = all
 standarize = true
 num_folds = 5
 cvmetric = mze
@@ -84,8 +84,8 @@ k = 10.^(-3:1:3)
 [cssvc-mze-regression5]
 {general-conf}
 seed = 1
-basedir = ../../../datasets/ordinal/regression/5bins
-datasets = abalone,housing,machine,pyrim,stock,bank1-5,bank2-5,calhousing-5,census1-5,census2-5,computer1-5,computer2-5
+basedir = ../../../datasets/discretized-regression/5bins
+datasets = all
 standarize = true
 num_folds = 5
 cvmetric = mze
@@ -102,8 +102,8 @@ k = 10.^(-3:1:3)
 [cssvc-mze-regression10]
 {general-conf}
 seed = 1
-basedir = ../../../datasets/ordinal/regression/10bins
-datasets = abalone10,housing10,machine10,pyrim10,stock10bank1-10,bank2-10,calhousing-10,census1-10,census2-10,computer1-10,computer2-10
+basedir = ../../../datasets/discretized-regression/10bins
+datasets = all
 standarize = true
 num_folds = 5
 cvmetric = mze