Skip to content

Commit

Permalink
FEATURE easily add new components
Browse files Browse the repository at this point in the history
  • Loading branch information
mfeurer committed Jan 19, 2016
1 parent 4d7fec3 commit 6adb0de
Show file tree
Hide file tree
Showing 62 changed files with 508 additions and 770 deletions.
20 changes: 4 additions & 16 deletions autosklearn/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ def pre_transform(self, X, y, fit_params=None, init_params=None):
method, param = init_param.split(":")
init_params_per_method[method][param] = value

# List of preprocessing steps (and their order)
preprocessors_names = [preprocessor[0] for
preprocessor in self._get_pipeline()[:-1]]

for preproc_name in preprocessors_names:
# Instantiate preprocessor objects
for preproc_name, preproc_class in self._get_pipeline()[:-1]:
preproc_params = {}
for instantiated_hyperparameter in self.configuration:
if not instantiated_hyperparameter.startswith(
Expand All @@ -92,20 +89,11 @@ def pre_transform(self, X, y, fit_params=None, init_params=None):
preproc_params[name_] = self.configuration[
instantiated_hyperparameter]

if preproc_name in \
components.feature_preprocessing_components._preprocessors:
_preprocessors = components.feature_preprocessing_components._preprocessors
elif preproc_name in \
components.data_preprocessing_components._preprocessors:
_preprocessors = components.data_preprocessing_components._preprocessors
else:
raise ValueError(preproc_name)

preprocessor_object = _preprocessors[preproc_name](
preprocessor_object = preproc_class(
random_state=self.random_state, **preproc_params)

# Ducktyping...
if hasattr(preprocessor_object, 'get_components'):
if hasattr(preproc_class, 'get_components'):
preprocessor_object = preprocessor_object.choice

steps.append((preproc_name, preprocessor_object))
Expand Down
4 changes: 2 additions & 2 deletions autosklearn/pipeline/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,11 +285,11 @@ def _get_pipeline(cls):

# Add the preprocessing component
steps.append(['preprocessor',
components.feature_preprocessing._preprocessors['preprocessor']])
components.feature_preprocessing.FeaturePreprocessorChoice])

# Add the classification component
steps.append(['classifier',
components.classification_components._classifiers['classifier']])
components.classification_components.ClassifierChoice])
return steps

def _get_estimator_hyperparameter_name(self):
Expand Down
39 changes: 0 additions & 39 deletions autosklearn/pipeline/components/__init__.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,3 @@
"""auto-sklearn can be easily extended with new classification and
preprocessing methods. At import time, auto-sklearn checks the directory
``autosklearn/pipeline/components/classification`` for classification
algorithms and ``autosklearn/pipeline/components/preprocessing`` for
preprocessing algorithms. To be found, the algorithm must provide a class
implementing one of the given
interfaces.
Coding Guidelines
=================
Please try to adhere to the `scikit-learn coding guidelines <http://scikit-learn.org/stable/developers/index.html#contributing>`_.
Own Implementation of Algorithms
================================
When adding new algorithms, it is possible to implement it directly in the
fit/predict/transform method of a component. We do not recommend this,
but rather recommend to implement an algorithm in a scikit-learn compatible
way (`see here <http://scikit-learn.org/stable/developers/index.html#apis-of-scikit-learn-objects>`_).
Such an implementation should then be put into the `implementation` directory
and can then be easily wrapped to become a component in auto-sklearn.
Classification
==============
The SimpleClassificationPipeline provides an interface for
Classification Algorithms inside auto-sklearn. It provides four important
functions. Two of them,
:meth:`get_hyperparameter_search_space() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.get_hyperparameter_search_space>`
and
:meth:`get_properties() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.get_properties>`
are used to
automatically create a valid configuration space. The other two,
:meth:`fit() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.fit>` and
:meth:`predict() <autosklearn.pipeline.components.classification_base.SimpleClassificationPipeline.predict>`
are an implementation of the `scikit-learn predictor API <http://scikit-learn.org/stable/developers/index.html#apis-of-scikit-learn-objects>`_.
Preprocessing
============="""

from . import classification as classification_components
from . import regression as regression_components
from . import feature_preprocessing as feature_preprocessing_components
Expand Down
206 changes: 110 additions & 96 deletions autosklearn/pipeline/components/base.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,87 @@
from collections import OrderedDict
import importlib
import inspect
import pkgutil
import sys


def find_components(package, directory, base_class):
    """Collect component classes from the modules located in ``directory``.

    Every non-package module found in ``directory`` that has not been
    imported yet is imported as ``<package>.<module name>``. Each class in
    such a module that lists ``base_class`` among its direct bases is
    registered under the module's short name.

    Parameters
    ----------
    package : str
        Dotted name of the package the modules in ``directory`` belong to.
    directory : str
        File system path that is scanned for modules.
    base_class : type
        Class that a component must directly inherit from.

    Returns
    -------
    collections.OrderedDict
        Maps the module name to the component class found in that module.
        If a module holds several matching classes, the last one reported by
        ``inspect.getmembers`` is kept.
    """
    found = OrderedDict()

    for _loader, short_name, is_package in pkgutil.iter_modules([directory]):
        dotted_name = "%s.%s" % (package, short_name)

        # Sub-packages and modules which are already present in
        # ``sys.modules`` are not inspected.
        if is_package or dotted_name in sys.modules:
            continue

        imported = importlib.import_module(dotted_name)
        for _attr_name, candidate in inspect.getmembers(imported):
            if not inspect.isclass(candidate):
                continue
            if base_class in candidate.__bases__:
                # TODO test if the candidate implements the interface
                # Keep in mind that the class itself is stored here; nothing
                # is instantiated at this point.
                found[short_name] = candidate

    return found


class ThirdPartyComponents(object):
    """Registry for component classes which are added at runtime.

    Parameters
    ----------
    base_class : type
        Class that every registered component must directly inherit from.

    Attributes
    ----------
    base_class : type
        The required direct base class.
    components : collections.OrderedDict
        Maps the class name of a registered component to the class itself,
        in the order of registration.
    """

    def __init__(self, base_class):
        self.base_class = base_class
        self.components = OrderedDict()

    def add_component(self, obj):
        """Validate ``obj`` and register it under its class name.

        Parameters
        ----------
        obj : type
            Class which has ``self.base_class`` among its direct bases and
            whose ``get_properties()`` returns exactly the expected keys.

        Raises
        ------
        TypeError
            If ``obj`` is not a class or does not directly inherit from
            ``self.base_class``.
        ValueError
            If ``get_properties()`` contains an unknown key or lacks one of
            the required keys.
        """
        if inspect.isclass(obj) and self.base_class in obj.__bases__:
            name = obj.__name__
            classifier = obj
        else:
            raise TypeError('add_component works only with a subclass of %s' %
                            str(self.base_class))

        properties = set(classifier.get_properties())
        should_be_there = set(['shortname',
                               'name',
                               'handles_regression',
                               'handles_classification',
                               'handles_multiclass',
                               'handles_multilabel',
                               'is_deterministic',
                               'input', 'output'])
        # Reject keys which are not part of the expected property set.
        for prop in properties:
            if prop not in should_be_there:
                raise ValueError('Property %s must not be specified for '
                                 'algorithm %s. Only the following properties '
                                 'can be specified: %s' %
                                 (prop, name, str(should_be_there)))
        # Every expected key has to be present.
        for prop in should_be_there:
            if prop not in properties:
                # The placeholders were previously never filled in, so the
                # message did not tell which property or algorithm failed.
                raise ValueError('Property %s not specified for algorithm %s'
                                 % (prop, name))

        self.components[name] = classifier
        # NOTE(review): looks like leftover debug output; kept so that the
        # observable behaviour is unchanged.
        print(name, classifier)


class AutoSklearnClassificationAlgorithm(object):
"""Provide an abstract interface for classification algorithms in
auto-sklearn.
Make a subclass of this and put it into the directory
`autosklearn/pipeline/components/classification` to make it available."""
See :ref:`extending` for more information."""

def __init__(self):
self.estimator = None
self.properties = None

@staticmethod
def get_properties(dataset_properties=None):
"""Get the properties of the underlying algorithm. These are:
* Short name
* Full name
* Can the algorithm handle missing values?
(handles_missing_values : {True, False})
* Can the algorithm handle nominal features?
(handles_nominal_features : {True, False})
* Can the algorithm handle numerical features?
(handles_numerical_features : {True, False})
* Does the algorithm prefer data scaled in [0,1]?
(prefers_data_scaled : {True, False}
* Does the algorithm prefer data normalized to 0-mean, 1std?
(prefers_data_normalized : {True, False}
* Can the algorithm handle multiclass-classification problems?
(handles_multiclass : {True, False})
* Can the algorithm handle multilabel-classification problems?
(handles_multilabel : {True, False}
* Is the algorithm deterministic for a given seed?
(is_deterministic : {True, False)
* Can the algorithm handle sparse data?
(handles_sparse : {True, False}
* What are the preferred types of the data array?
(preferred_dtype : list of tuples)
"""Get the properties of the underlying algorithm.
Find more information at :ref:`get_properties`
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
Expand All @@ -46,6 +93,11 @@ def get_properties(dataset_properties=None):
def get_hyperparameter_search_space(dataset_properties=None):
"""Return the configuration space of this classification algorithm.
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
HPOlibConfigspace.configuration_space.ConfigurationSpace
Expand All @@ -62,7 +114,7 @@ def fit(self, X, y):
X : array-like, shape = (n_samples, n_features)
Training data
y : array-like, shape = [n_samples]
y : array-like, shape = (n_samples,) or shape = (n_samples, n_labels)
Returns
-------
Expand All @@ -86,7 +138,7 @@ def predict(self, X):
Returns
-------
array, shape = (n_samples,)
array, shape = (n_samples,) or shape = (n_samples, n_labels)
Returns the predicted values
Notes
Expand Down Expand Up @@ -127,42 +179,21 @@ class AutoSklearnPreprocessingAlgorithm(object):
"""Provide an abstract interface for preprocessing algorithms in
auto-sklearn.
Make a subclass of this and put it into the directory
`autosklearn/pipeline/components/preprocessing` to make it available."""
See :ref:`extending` for more information."""

def __init__(self):
self.preprocessor = None

@staticmethod
def get_properties(dataset_properties=None):
"""Get the properties of the underlying algorithm. These are:
* Short name
* Full name
* Can the algorithm handle missing values?
(handles_missing_values : {True, False})
* Can the algorithm handle nominal features?
(handles_nominal_features : {True, False})
* Can the algorithm handle numerical features?
(handles_numerical_features : {True, False})
* Does the algorithm prefer data scaled in [0,1]?
(prefers_data_scaled : {True, False}
* Does the algorithm prefer data normalized to 0-mean, 1std?
(prefers_data_normalized : {True, False}
* Can preprocess regression data?
(handles_regression : {True, False}
* Can preprocess classification data?
(handles_classification : {True, False}
* Can the algorithm handle multiclass-classification problems?
(handles_multiclass : {True, False})
* Can the algorithm handle multilabel-classification problems?
(handles_multilabel : {True, False}
* Is the algorithm deterministic for a given seed?
(is_deterministic : {True, False)
* Can the algorithm handle sparse data?
(handles_sparse : {True, False}
* What are the preferred types of the data array?
(preferred_dtype : list of tuples)
"""Get the properties of the underlying algorithm.
Find more information at :ref:`get_properties`
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
Expand All @@ -174,6 +205,11 @@ def get_properties(dataset_properties=None):
def get_hyperparameter_search_space(dataset_properties=None):
"""Return the configuration space of this preprocessing algorithm.
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
HPOlibConfigspace.configuration_space.ConfigurationSpace
Expand All @@ -190,7 +226,7 @@ def fit(self, X, Y):
X : array-like, shape = (n_samples, n_features)
Training data
y : array-like, shape = [n_samples]
y : array-like, shape = (n_samples,) or shape = (n_samples, n_labels)
Returns
-------
Expand Down Expand Up @@ -234,7 +270,7 @@ def get_preprocessor(self):

def __str__(self):
name = self.get_properties()['name']
return "autosklearn.pipeline %" % name
return "autosklearn.pipeline %s" % name


class AutoSklearnRegressionAlgorithm(object):
Expand All @@ -248,28 +284,15 @@ def __init__(self):
self.estimator = None
self.properties = None

@staticmethod
def get_properties(dataset_properties=None):
"""Get the properties of the underlying algorithm. These are:
* Short name
* Full name
* Can the algorithm handle missing values?
(handles_missing_values : {True, False})
* Can the algorithm handle nominal features?
(handles_nominal_features : {True, False})
* Can the algorithm handle numerical features?
(handles_numerical_features : {True, False})
* Does the algorithm prefer data scaled in [0,1]?
(prefers_data_scaled : {True, False}
* Does the algorithm prefer data normalized to 0-mean, 1std?
(prefers_data_normalized : {True, False}
* Is the algorithm deterministic for a given seed?
(is_deterministic : {True, False)
* Can the algorithm handle sparse data?
(handles_sparse : {True, False}
* What are the preferred types of the data array?
(preferred_dtype : list of tuples)
"""Get the properties of the underlying algorithm.
Find more information at :ref:`get_properties`
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
Expand All @@ -281,6 +304,11 @@ def get_properties(dataset_properties=None):
def get_hyperparameter_search_space(dataset_properties=None):
"""Return the configuration space of this regression algorithm.
Parameters
----------
dataset_properties : dict, optional (default=None)
Returns
-------
HPOlibConfigspace.configuration_space.ConfigurationSpace
Expand Down Expand Up @@ -331,19 +359,6 @@ def predict(self, X):
-learn-objects>`_ for further information."""
raise NotImplementedError()

def predict_proba(self, X):
"""Predict probabilities.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Returns
-------
array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
"""
raise NotImplementedError()

def get_estimator(self):
"""Return the underlying estimator object.
Expand All @@ -355,6 +370,5 @@ def get_estimator(self):

def __str__(self):
name = self.get_properties()['name']
return "autosklearn.pipeline %" % name

return "autosklearn.pipeline %s" % name

Loading

0 comments on commit 6adb0de

Please sign in to comment.