FEATURE easily add new components

zhangweijiqn · Jan 19, 2016 · 6adb0de · 6adb0de
1 parent 4d7fec3
commit 6adb0de
Show file tree

Hide file tree

Showing 62 changed files with 508 additions and 770 deletions.
diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py
@@ -75,11 +75,8 @@ def pre_transform(self, X, y, fit_params=None, init_params=None):
                 method, param = init_param.split(":")
                 init_params_per_method[method][param] = value
 
-        # List of preprocessing steps (and their order)
-        preprocessors_names = [preprocessor[0] for
-                               preprocessor in self._get_pipeline()[:-1]]
-
-        for preproc_name in preprocessors_names:
+        # Instantiate preprocessor objects
+        for preproc_name, preproc_class in self._get_pipeline()[:-1]:
             preproc_params = {}
             for instantiated_hyperparameter in self.configuration:
                 if not instantiated_hyperparameter.startswith(
@@ -92,20 +89,11 @@ def pre_transform(self, X, y, fit_params=None, init_params=None):
                 preproc_params[name_] = self.configuration[
                     instantiated_hyperparameter]
 
-            if preproc_name in \
-                    components.feature_preprocessing_components._preprocessors:
-                _preprocessors = components.feature_preprocessing_components._preprocessors
-            elif preproc_name in \
-                    components.data_preprocessing_components._preprocessors:
-                _preprocessors = components.data_preprocessing_components._preprocessors
-            else:
-                raise ValueError(preproc_name)
-
-            preprocessor_object = _preprocessors[preproc_name](
+            preprocessor_object = preproc_class(
                 random_state=self.random_state, **preproc_params)
 
             # Ducktyping...
-            if hasattr(preprocessor_object, 'get_components'):
+            if hasattr(preproc_class, 'get_components'):
                 preprocessor_object = preprocessor_object.choice
 
             steps.append((preproc_name, preprocessor_object))

diff --git a/autosklearn/pipeline/classification.py b/autosklearn/pipeline/classification.py
@@ -285,11 +285,11 @@ def _get_pipeline(cls):
 
         # Add the preprocessing component
         steps.append(['preprocessor',
-                      components.feature_preprocessing._preprocessors['preprocessor']])
+                      components.feature_preprocessing.FeaturePreprocessorChoice])
 
         # Add the classification component
         steps.append(['classifier',
-                      components.classification_components._classifiers['classifier']])
+                      components.classification_components.ClassifierChoice])
         return steps
 
     def _get_estimator_hyperparameter_name(self):

diff --git a/autosklearn/pipeline/components/__init__.py b/autosklearn/pipeline/components/__init__.py
@@ -1,42 +1,3 @@
-"""auto-sklearn can be easily extended with new classification and
-preprocessing methods. At import time, auto-sklearn checks the directory
-``autosklearn/pipeline/components/classification`` for classification
-algorithms and ``autosklearn/pipeline/components/preprocessing`` for
-preprocessing algorithms. To be found, the algorithm must be provide a class
-implementing one of the given
-interfaces.
-
-Coding Guidelines
-=================
-Please try to adhere to the `scikit-learn coding guidelines <http://scikit-learn.org/stable/developers/index.html#contributing>`_.
-
-Own Implementation of Algorithms
-================================
-When adding new algorithms, it is possible to implement it directly in the
-fit/predict/transform method of a component. We do not recommend this,
-but rather recommend to implement an algorithm in a scikit-learn compatible
-way (`see here <http://scikit-learn.org/stable/developers/index.html#apis-of-scikit-learn-objects>`_).
-Such an implementation should then be put into the `implementation` directory.
-and can then be easily wrapped with to become a component in auto-sklearn.
-
-Classification
-==============
-
-The SimpleClassificationPipeline provides an interface for
-Classification Algorithms inside auto-sklearn. It provides four important
-functions. Two of them,
-:meth:`get_hyperparameter_search_space() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.get_hyperparameter_search_space>`
-and
-:meth:`get_properties() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.get_properties>`
-are used to
-automatically create a valid configuration space. The other two,
-:meth:`fit() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.fit>` and
-:meth:`predict() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.predict>`
-are an implementation of the `scikit-learn predictor API <http://scikit-learn.org/stable/developers/index.html#apis-of-scikit-learn-objects>`_.
-
-Preprocessing
-============="""
-
 from . import classification as classification_components
 from . import regression as regression_components
 from . import feature_preprocessing as feature_preprocessing_components

diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py
@@ -1,40 +1,87 @@
+from collections import OrderedDict
+import importlib
+import inspect
+import pkgutil
+import sys
+
+
+def find_components(package, directory, base_class):
+    """Import the modules found in ``directory`` and collect component classes.
+
+    Parameters
+    ----------
+    package : str
+        Dotted package name; it is joined with each module name to build
+        the import path handed to ``importlib.import_module``.
+
+    directory : str
+        Directory that is scanned with ``pkgutil.iter_modules``.
+
+    base_class : class
+        Only classes that list this class directly in ``__bases__`` are
+        collected.
+
+    Returns
+    -------
+    OrderedDict
+        Maps the module name (not the class name) to the class found in
+        that module, in the order the modules were iterated.
+    """
+    components = OrderedDict()
+
+    for module_loader, module_name, ispkg in pkgutil.iter_modules(
+            [directory]):
+        full_module_name = "%s.%s" % (package, module_name)
+        # Sub-packages and modules that are already present in sys.modules
+        # are skipped, so they contribute no entry to the result.
+        if full_module_name not in sys.modules and not ispkg:
+            module = importlib.import_module(full_module_name)
+
+            for member_name, obj in inspect.getmembers(module):
+                if inspect.isclass(
+                        obj) and base_class in obj.__bases__:
+                    # TODO test if the obj implements the interface
+                    # The class object itself is stored; nothing is
+                    # instantiated here. The key is the module name, so a
+                    # later match in the same module replaces an earlier one.
+                    classifier = obj
+                    components[module_name] = classifier
+
+    return components
+
+
+class ThirdPartyComponents(object):
+    """Registry of component classes that are added at runtime.
+
+    Parameters
+    ----------
+    base_class : class
+        Class that every registered component must list directly in its
+        ``__bases__``.
+
+    Attributes
+    ----------
+    base_class : class
+        The ``base_class`` passed to the constructor.
+
+    components : OrderedDict
+        Maps the class name to the registered class, in registration order.
+    """
+
+    def __init__(self, base_class):
+        self.base_class = base_class
+        self.components = OrderedDict()
+
+    def add_component(self, obj):
+        """Validate ``obj`` and register it under its class name.
+
+        Parameters
+        ----------
+        obj : class
+            Direct subclass of ``self.base_class`` whose ``get_properties()``
+            returns exactly the expected set of property names.
+
+        Raises
+        ------
+        TypeError
+            If ``obj`` is not a class or does not list ``self.base_class``
+            in its ``__bases__``.
+
+        ValueError
+            If ``get_properties()`` contains an unknown property or lacks a
+            required one.
+        """
+        if inspect.isclass(obj) and self.base_class in obj.__bases__:
+            name = obj.__name__
+            classifier = obj
+        else:
+            raise TypeError('add_component works only with a subclass of %s' %
+                            str(self.base_class))
+
+        properties = set(classifier.get_properties())
+        should_be_there = set(['shortname',
+                               'name',
+                               'handles_regression',
+                               'handles_classification',
+                               'handles_multiclass',
+                               'handles_multilabel',
+                               'is_deterministic',
+                               'input', 'output'])
+        for property_name in properties:
+            if property_name not in should_be_there:
+                raise ValueError('Property %s must not be specified for '
+                                 'algorithm %s. Only the following properties '
+                                 'can be specified: %s' %
+                                 (property_name, name, str(should_be_there)))
+        for property_name in should_be_there:
+            if property_name not in properties:
+                # The placeholders need their arguments; without them the
+                # message is emitted with literal '%s'.
+                raise ValueError('Property %s not specified for algorithm %s' %
+                                 (property_name, name))
+
+        self.components[name] = classifier
+
+
 class AutoSklearnClassificationAlgorithm(object):
     """Provide an abstract interface for classification algorithms in
     auto-sklearn.
 
-    Make a subclass of this and put it into the directory
-    `autosklearn/pipeline/components/classification` to make it available."""
+    See :ref:`extending` for more information."""
 
     def __init__(self):
         self.estimator = None
         self.properties = None
 
     @staticmethod
     def get_properties(dataset_properties=None):
-        """Get the properties of the underlying algorithm. These are:
-
-        * Short name
-        * Full name
-        * Can the algorithm handle missing values?
-          (handles_missing_values : {True, False})
-        * Can the algorithm handle nominal features?
-          (handles_nominal_features : {True, False})
-        * Can the algorithm handle numerical features?
-          (handles_numerical_features : {True, False})
-        * Does the algorithm prefer data scaled in [0,1]?
-          (prefers_data_scaled : {True, False}
-        * Does the algorithm prefer data normalized to 0-mean, 1std?
-          (prefers_data_normalized : {True, False}
-        * Can the algorithm handle multiclass-classification problems?
-          (handles_multiclass : {True, False})
-        * Can the algorithm handle multilabel-classification problems?
-          (handles_multilabel : {True, False}
-        * Is the algorithm deterministic for a given seed?
-          (is_deterministic : {True, False)
-        * Can the algorithm handle sparse data?
-          (handles_sparse : {True, False}
-        * What are the preferred types of the data array?
-          (preferred_dtype : list of tuples)
+        """Get the properties of the underlying algorithm.
+
+        Find more information at :ref:`get_properties`
+
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
 
         Returns
         -------
@@ -46,6 +93,11 @@ def get_properties(dataset_properties=None):
     def get_hyperparameter_search_space(dataset_properties=None):
         """Return the configuration space of this classification algorithm.
 
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
+
         Returns
         -------
         HPOlibConfigspace.configuration_space.ConfigurationSpace
@@ -62,7 +114,7 @@ def fit(self, X, y):
         X : array-like, shape = (n_samples, n_features)
             Training data
 
-        y : array-like, shape = [n_samples]
+        y : array-like, shape = (n_samples,) or shape = (n_samples, n_labels)
 
         Returns
         -------
@@ -86,7 +138,7 @@ def predict(self, X):
 
         Returns
         -------
-        array, shape = (n_samples,)
+        array, shape = (n_samples,) or shape = (n_samples, n_labels)
             Returns the predicted values
 
         Notes
@@ -127,42 +179,21 @@ class AutoSklearnPreprocessingAlgorithm(object):
     """Provide an abstract interface for preprocessing algorithms in
     auto-sklearn.
 
-    Make a subclass of this and put it into the directory
-    `autosklearn/pipeline/components/preprocessing` to make it available."""
+    See :ref:`extending` for more information."""
 
     def __init__(self):
         self.preprocessor = None
 
     @staticmethod
     def get_properties(dataset_properties=None):
-        """Get the properties of the underlying algorithm. These are:
-
-        * Short name
-        * Full name
-        * Can the algorithm handle missing values?
-          (handles_missing_values : {True, False})
-        * Can the algorithm handle nominal features?
-          (handles_nominal_features : {True, False})
-        * Can the algorithm handle numerical features?
-          (handles_numerical_features : {True, False})
-        * Does the algorithm prefer data scaled in [0,1]?
-          (prefers_data_scaled : {True, False}
-        * Does the algorithm prefer data normalized to 0-mean, 1std?
-          (prefers_data_normalized : {True, False}
-        * Can preprocess regression data?
-          (handles_regression : {True, False}
-        * Can preprocess classification data?
-          (handles_classification : {True, False}
-        * Can the algorithm handle multiclass-classification problems?
-          (handles_multiclass : {True, False})
-        * Can the algorithm handle multilabel-classification problems?
-          (handles_multilabel : {True, False}
-        * Is the algorithm deterministic for a given seed?
-          (is_deterministic : {True, False)
-        * Can the algorithm handle sparse data?
-          (handles_sparse : {True, False}
-        * What are the preferred types of the data array?
-          (preferred_dtype : list of tuples)
+        """Get the properties of the underlying algorithm.
+
+        Find more information at :ref:`get_properties`
+
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
 
         Returns
         -------
@@ -174,6 +205,11 @@ def get_properties(dataset_properties=None):
     def get_hyperparameter_search_space(dataset_properties=None):
         """Return the configuration space of this preprocessing algorithm.
 
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
+
         Returns
         -------
         HPOlibConfigspace.configuration_space.ConfigurationSpace
@@ -190,7 +226,7 @@ def fit(self, X, Y):
         X : array-like, shape = (n_samples, n_features)
             Training data
 
-        y : array-like, shape = [n_samples]
+        y : array-like, shape = (n_samples,) or shape = (n_samples, n_labels)
 
         Returns
         -------
@@ -234,7 +270,7 @@ def get_preprocessor(self):
 
     def __str__(self):
         name = self.get_properties()['name']
-        return "autosklearn.pipeline %" % name
+        return "autosklearn.pipeline %s" % name
 
 
 class AutoSklearnRegressionAlgorithm(object):
@@ -248,28 +284,15 @@ def __init__(self):
         self.estimator = None
         self.properties = None
 
-    @staticmethod
     def get_properties(dataset_properties=None):
-        """Get the properties of the underlying algorithm. These are:
-
-        * Short name
-        * Full name
-        * Can the algorithm handle missing values?
-          (handles_missing_values : {True, False})
-        * Can the algorithm handle nominal features?
-          (handles_nominal_features : {True, False})
-        * Can the algorithm handle numerical features?
-          (handles_numerical_features : {True, False})
-        * Does the algorithm prefer data scaled in [0,1]?
-          (prefers_data_scaled : {True, False}
-        * Does the algorithm prefer data normalized to 0-mean, 1std?
-          (prefers_data_normalized : {True, False}
-        * Is the algorithm deterministic for a given seed?
-          (is_deterministic : {True, False)
-        * Can the algorithm handle sparse data?
-          (handles_sparse : {True, False}
-        * What are the preferred types of the data array?
-          (preferred_dtype : list of tuples)
+        """Get the properties of the underlying algorithm.
+
+        Find more information at :ref:`get_properties`
+
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
 
         Returns
         -------
@@ -281,6 +304,11 @@ def get_properties(dataset_properties=None):
     def get_hyperparameter_search_space(dataset_properties=None):
         """Return the configuration space of this regression algorithm.
 
+        Parameters
+        ----------
+
+        dataset_properties : dict, optional (default=None)
+
         Returns
         -------
         HPOlibConfigspace.configuration_space.ConfigurationSpace
@@ -331,19 +359,6 @@ def predict(self, X):
         -learn-objects>`_ for further information."""
         raise NotImplementedError()
 
-    def predict_proba(self, X):
-        """Predict probabilities.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-
-        Returns
-        -------
-        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
-        """
-        raise NotImplementedError()
-
     def get_estimator(self):
         """Return the underlying estimator object.
 
@@ -355,6 +370,5 @@ def get_estimator(self):
 
     def __str__(self):
         name = self.get_properties()['name']
-        return "autosklearn.pipeline %" % name
-
+        return "autosklearn.pipeline %s" % name