Merge pull request #879 from alan-turing-institute/dev
For a 0.17 release
ablaom authored Dec 29, 2021
2 parents 11d0369 + d977e84 commit f891cf8
Showing 30 changed files with 472 additions and 369 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.3'
- '1.6'
- '1' # automatically expands to the latest stable 1.x release of Julia.
os:
- ubuntu-latest
16 changes: 8 additions & 8 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJ"
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.16.11"
version = "0.17.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -28,18 +28,18 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CategoricalArrays = "0.8,0.9, 0.10"
ComputationalResources = "0.3"
Distributions = "0.21,0.22,0.23, 0.24, 0.25"
MLJBase = "0.18.10"
MLJEnsembles = "0.1"
MLJIteration = "0.3"
MLJModels = "0.14"
MLJBase = "0.19"
MLJEnsembles = "0.2"
MLJIteration = "0.4"
MLJModels = "0.15"
MLJSerialization = "1.1"
MLJTuning = "0.6"
OpenML = "0.1"
OpenML = "0.2"
ProgressMeter = "1.1"
ScientificTypes = "2"
ScientificTypes = "3"
StatsBase = "0.32,0.33"
Tables = "0.2,1.0"
julia = "1.3"
julia = "1.6"

[extras]
NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
16 changes: 9 additions & 7 deletions docs/Project.toml
@@ -1,5 +1,6 @@
[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
@@ -29,12 +30,13 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"

[compat]
CategoricalDistributions = "0.1"
Documenter = "0.26"
MLJBase = "0.18"
MLJEnsembles = "0.1"
MLJIteration = "0.3"
MLJModels = "0.14.4"
MLJBase = "0.19"
MLJEnsembles = "0.2"
MLJIteration = "0.4"
MLJModels = "0.15"
MLJTuning = "0.6.5"
ScientificTypes = "2.1"
ScientificTypesBase = "2.1"
julia = "1"
ScientificTypes = "3"
ScientificTypesBase = "3"
julia = "1.6"
5 changes: 4 additions & 1 deletion docs/make.jl
@@ -19,6 +19,7 @@ import ScientificTypesBase
import Distributions
using CategoricalArrays # avoid types like CategoricalArrays.Categorica
using LossFunctions
import CategoricalDistributions

const MMI = MLJModelInterface

@@ -45,6 +46,7 @@ pages = [
"More on Probablistic Predictors" => "more_on_probabilistic_predictors.md",
"Composing Models" => "composing_models.md",
"Linear Pipelines" => "linear_pipelines.md",
"Target Transformations" => "target_transformations.md",
"Homogeneous Ensembles" => "homogeneous_ensembles.md",
"Model Stacking" => "model_stacking.md",
"Controlling Iterative Models" => "controlling_iterative_models.md",
@@ -85,7 +87,8 @@ makedocs(
MLJIteration,
MLJSerialization,
EarlyStopping,
IterationControl],
IterationControl,
CategoricalDistributions],
pages = pages)

# By default Documenter does not deploy docs just for PR
38 changes: 15 additions & 23 deletions docs/src/about_mlj.md
@@ -45,7 +45,7 @@ Loading and instantiating a gradient tree-boosting model:
using MLJ
Booster = @load EvoTreeRegressor # loads code defining a model type
booster = Booster(max_depth=2) # specify hyper-parameter at construction
booster.nrounds=50 # or mutate post facto
booster.nrounds=50 # or mutate afterwards
```

This model is an example of an iterative model. As it stands, the
@@ -72,7 +72,7 @@ iterated_booster = IteratedModel(model=booster,
Combining the model with categorical feature encoding:

```julia
pipe = @pipeline ContinuousEncoder iterated_booster
pipe = ContinuousEncoder() |> iterated_booster
```

#### Composition 3: Wrapping the model to make it "self-tuning"
@@ -108,34 +108,26 @@ House Price dataset:
```julia
X, y = @load_reduced_ames;
```

Binding the "self-tuning" pipeline model to data in a *machine* (which
will additionally store *learned* parameters):

```julia
mach = machine(self_tuning_pipe, X, y)
```

Evaluating the "self-tuning" pipeline model's performance using 5-fold
cross-validation (implies multiple layers of nested resampling):

```julia
julia> evaluate!(mach,
measures=[l1, l2],
resampling=CV(nfolds=5, rng=123),
acceleration=CPUThreads(),
verbosity=2)
julia> evaluate(self_tuning_pipe, X, y,
measures=[l1, l2],
resampling=CV(nfolds=5, rng=123),
acceleration=CPUThreads(),
verbosity=2)
PerformanceEvaluation object with these fields:
measure, measurement, operation, per_fold,
per_observation, fitted_params_per_fold,
report_per_fold, train_test_pairs
Extract:
┌────────────────────┬─────────────┬───────────┬───────────────────────────────────────────────┐
│ measure            │ measurement │ operation │ per_fold                                      │
├────────────────────┼─────────────┼───────────┼───────────────────────────────────────────────┤
│ LPLoss(p = 1) @638 │ 16800.0     │ predict   │ [16500.0, 16300.0, 16300.0, 16600.0, 18600.0] │
│ LPLoss(p = 2) @308 │ 6.65e8      │ predict   │ [6.14e8, 6.3e8, 5.98e8, 6.17e8, 8.68e8]       │
└────────────────────┴─────────────┴───────────┴───────────────────────────────────────────────┘
┌───────────────┬─────────────┬───────────┬───────────────────────────────────────────────┐
│ measure       │ measurement │ operation │ per_fold                                      │
├───────────────┼─────────────┼───────────┼───────────────────────────────────────────────┤
│ LPLoss(p = 1) │ 17200.0     │ predict   │ [16500.0, 17100.0, 16300.0, 17500.0, 18900.0] │
│ LPLoss(p = 2) │ 6.83e8      │ predict   │ [6.14e8, 6.64e8, 5.98e8, 6.37e8, 9.03e8]      │
└───────────────┴─────────────┴───────────┴───────────────────────────────────────────────┘
```

Try out MLJ yourself in the following batteries-included Binder
@@ -271,7 +263,7 @@ julia> tree = Tree() # instance

where you will also be asked to choose a providing package, in case
more than one provides a `DecisionTreeClassifier` model. For more on
identifying the name of an applicable model, see [Model Search](@ref model_search).
identifying the name of an applicable model, see [Model Search](@ref model_search).
For non-interactive loading of code (e.g., from a
module or function) see [Loading Model Code](@ref).

@@ -360,7 +352,7 @@ An in-depth view of MLJ's model composition design:

```bibtex
@misc{blaom2020flexible,
title={Flexible model composition in machine learning and its implementation in {MLJ}},
title={Flexible model composition in machine learning and its implementation in {MLJ}},
author={Anthony D. Blaom and Sebastian J. Vollmer},
year={2020},
eprint={2012.15505},
110 changes: 61 additions & 49 deletions docs/src/adding_models_for_general_use.md
@@ -545,24 +545,20 @@ probability distribution `p(X|y)` above).
#### Prediction types for deterministic responses.

In the case of `Deterministic` models, `yhat` should have the same
scitype as the `y` passed to `fit` (see above). Any `CategoricalValue`
elements of `yhat` **must have a pool == to the
pool of the target `y` presented in training**, even if not all levels
appear in the training data or prediction itself. For example, in the
case of a univariate target, such as `scitype(y) <:
AbstractVector{Multiclass{3}}`, one requires `MLJ.classes(yhat[i]) ==
MLJ.classes(y[j])` for all admissible `i` and `j`. (The method
`classes` is described under [Convenience methods](@ref) below).
scitype as the `y` passed to `fit` (see above). If `y` is a
`CategoricalVector` (classification) then elements of the prediction
`yhat` **must have a pool == to the pool of the target `y` presented
in training**, even if not all levels appear in the training data or
prediction itself.

Unfortunately, code not written with the preservation of categorical
levels in mind poses special problems. To help with this,
MLJModelInterface provides three utility methods: `int` (for
converting a `CategoricalValue` into an integer, the ordering of these
integers being consistent with that of the pool), `decoder` (for
constructing a callable object that decodes the integers back into
`CategoricalValue` objects), and `classes`, for extracting all the
`CategoricalValue` objects sharing the pool of a particular
value. Refer to [Convenience methods](@ref) below for important
MLJModelInterface provides some utilities:
[`MLJModelInterface.int`](@ref) (for converting a `CategoricalValue`
into an integer, the ordering of these integers being consistent with
that of the pool) and `MLJModelInterface.decoder` (for constructing a
callable object that decodes the integers back into `CategoricalValue`
objects). Refer to [Convenience methods](@ref) below for important
details.
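
For orientation, here is a minimal sketch of the `int`/`decoder` round
trip (this assumes MLJBase is loaded, which activates these
MLJModelInterface methods; the data is made up):

```julia
using MLJBase, CategoricalArrays
import MLJModelInterface
const MMI = MLJModelInterface

y = categorical([:a, :b, :a])   # a CategoricalVector whose pool is [:a, :b]
yint = MMI.int(y)               # integer codes, ordered consistently with the pool
decode = MMI.decoder(y[1])      # callable built from any element sharing the pool
decode.(yint)                   # recovers the original CategoricalValue elements
```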

Note that a decoder created during `fit` may need to be bundled with
@@ -574,7 +570,7 @@ may look something like this:
```julia
function MMI.fit(model::SomeSupervisedModel, verbosity, X, y)
yint = MMI.int(y)
a_target_element = y[1] # a CategoricalValue/String
a_target_element = y[1] # a CategoricalValue/String
decode = MMI.decoder(a_target_element) # can be called on integers

core_fitresult = SomePackage.fit(X, yint, verbosity=verbosity)
@@ -592,7 +588,7 @@ while a corresponding deterministic `predict` operation might look like this:
function MMI.predict(model::SomeSupervisedModel, fitresult, Xnew)
decode, core_fitresult = fitresult
yhat = SomePackage.predict(core_fitresult, Xnew)
return decode.(yhat) # or decode(yhat) also works
return decode.(yhat)
end
```

@@ -607,23 +603,43 @@ than wrapping an existing one, these extra measures may be unnecessary.
#### Prediction types for probabilistic responses

In the case of `Probabilistic` models with univariate targets, `yhat`
must be an `AbstractVector` whose elements are distributions (one distribution
per row of `Xnew`).

Presently, a *distribution* is any object `d` for which
`MMI.isdistribution(::d) = true`, which is the case for objects of
type `Distributions.Sampleable`.

Use the distribution `MMI.UnivariateFinite` for `Probabilistic` models
predicting a target with `Finite` scitype (classifiers). In this case
the eltype of the training target `y` will be a `CategoricalValue`.

For efficiency, one should not construct `UnivariateDistribution`
instances one at a time. Rather, once a probability vector or matrix
is known, construct an instance of `UnivariateFiniteVector <:
AbstractArray{<:UnivariateFinite,1}` to return. Both `UnivariateFinite`
and `UnivariateFiniteVector` objects are constructed using the single
`UnivariateFinite` function.
must be an `AbstractVector` or table whose elements are distributions.
In the common case of a vector (single target), this means one
distribution per row of `Xnew`.

A *distribution* is some object that, at the least, implements
`Base.rand` (i.e., is something that can be sampled). Currently, all
performance measures (metrics) defined in MLJBase.jl additionally
assume that a distribution is either:

- An instance of some subtype of `Distributions.Distribution`, an
abstract type defined in the
[`Distributions.jl`](https://juliastats.org/Distributions.jl/stable/)
package; or

- An instance of `CategoricalDistributions.UnivariateFinite`, from the
[CategoricalDistributions.jl](https://github.com/JuliaAI/CategoricalDistributions.jl)
package, *which should be used for all probabilistic classifiers*,
i.e., for predictors whose target has scientific type
`<:AbstractVector{<:Finite}`.

All such distributions implement the probability mass or density
function `Distributions.pdf`. If your model's predictions cannot be
objects of this form, then you will need to implement
appropriate performance measures to buy into MLJ's performance
evaluation apparatus.
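
For orientation, here is a hedged sketch (the labels and probabilities
are invented) of the kind of object a probabilistic classifier returns
and how it can be interrogated:

```julia
using CategoricalArrays, CategoricalDistributions
import Distributions

y = categorical([:no, :yes, :no])                         # training target; pool is [:no, :yes]
d = UnivariateFinite([:no, :yes], [0.3, 0.7], pool=y[1])  # a single predicted distribution
Distributions.pdf(d, :yes)                                # 0.7
rand(d)                                                   # sample a CategoricalValue from d
```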

An implementation can avoid CategoricalDistributions.jl as a
dependency by using the "dummy" constructor
`MLJModelInterface.UnivariateFinite`, which is bound to the true one
when MLJBase.jl is loaded.

For efficiency, one should not construct `UnivariateFinite` instances
one at a time. Rather, once a probability vector, matrix, or
dictionary is known, construct an instance of `UnivariateFiniteVector
<: AbstractArray{<:UnivariateFinite,1}` to return. Both
`UnivariateFinite` and `UnivariateFiniteVector` objects are
constructed using the single `UnivariateFinite` function.

For example, suppose the target `y` arrives as a subsample of some
`ybig` and is missing some classes:
@@ -641,7 +657,7 @@ classes `[:a, :b]` are in an `n x 2` matrix `probs` (where `n` is the number of
rows of `Xnew`) then you return

```julia
yhat = UnivariateFinite([:a, :b], probs, pool=an_element)
yhat = MLJModelInterface.UnivariateFinite([:a, :b], probs, pool=an_element)
```

This object automatically assigns zero-probability to the unseen class
@@ -650,11 +666,6 @@ vector). If you would like to assign `:rare` non-zero probabilities,
simply add it to the first vector (the *support*) and supply a larger
`probs` matrix.
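
A self-contained toy run illustrating that behaviour might look like
this (the data and probabilities are invented; MLJBase is loaded so
that the `MLJModelInterface.UnivariateFinite` stub is bound to the
real constructor):

```julia
using MLJBase, CategoricalArrays
import MLJModelInterface, Distributions

ybig = categorical([:a, :b, :a, :rare])   # :rare is in the pool
an_element = ybig[1]
probs = [0.2 0.8; 0.7 0.3]                # one row per prediction; columns for :a and :b
yhat = MLJModelInterface.UnivariateFinite([:a, :b], probs, pool=an_element)
Distributions.pdf.(yhat, :rare)           # [0.0, 0.0]: in the pool, but assigned zero probability
```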

If instead of raw labels `[:a, :b]` you have the corresponding
`CategoricalElement`s (from, e.g., `filter(cv->cv in unique(y),
classes(y))`) then you can use these instead and drop the `pool`
specifier.

In a binary classification problem it suffices to specify a single
vector of probabilities, provided you specify `augment=true`, as in
the following example, *and note carefully that these probabilities are
@@ -665,11 +676,12 @@ constructor:*
y = categorical([:TRUE, :FALSE, :FALSE, :TRUE, :TRUE])
an_element = y[1]
probs = rand(10)
yhat = UnivariateFinite([:FALSE, :TRUE], probs, augment=true, pool=an_element)
yhat = MLJModelInterface.UnivariateFinite([:FALSE, :TRUE], probs, augment=true, pool=an_element)
```

The constructor has a lot of options, including passing a dictionary
instead of vectors. See [`UnivariateFinite`](@ref) for details.
instead of vectors. See
[`CategoricalDistributions.UnivariateFinite`](@ref) for details.

See
[LinearBinaryClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/master/src/GLM.jl)
@@ -874,8 +886,8 @@ generally relevant use-case is iterative models, where calls to
increase the number of iterations only restarts the iterative
procedure if other hyperparameters have also changed. (A useful method
for inspecting model changes in such cases is
`MLJModelInterface.is_same_except`. ) For an example, see the MLJ [ensemble
code](https://github.com/alan-turing-institute/MLJ.jl/blob/master/src/ensembles.jl).
`MLJModelInterface.is_same_except`. ) For an example, see
[MLJEnsembles.jl](https://github.com/JuliaAI/MLJEnsembles.jl).
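
By way of illustration, here is a hedged sketch only of an `update`
implementation that warm-restarts when it is safe to do so. The model
type `SomeIterativeModel`, its `nrounds` field and the helper
`continue_training!` are hypothetical, and we assume `fit` returned
`deepcopy(model)` as its cache (as elsewhere on this page, `MMI` is
MLJModelInterface):

```julia
function MMI.update(model::SomeIterativeModel, verbosity,
                    old_fitresult, old_cache, X, y)
    old_model = old_cache   # by assumption, `fit` returned `deepcopy(model)` as its cache
    if MMI.is_same_except(model, old_model, :nrounds) && model.nrounds >= old_model.nrounds
        # Only the iteration count grew: continue training from the old state.
        extra_rounds = model.nrounds - old_model.nrounds
        fitresult = continue_training!(old_fitresult, X, y, extra_rounds)  # hypothetical helper
        return fitresult, deepcopy(model), NamedTuple()
    else
        # Some other hyperparameter changed: fall back to training from scratch.
        return MMI.fit(model, verbosity, X, y)
    end
end
```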

A third use-case is to avoid repeating time-consuming preprocessing of
`X` and `y` required by some models.
@@ -1194,7 +1206,11 @@ MLJModelInterface.int
```

```@docs
MLJModelInterface.classes
CategoricalDistributions.UnivariateFinite
```

```@docs
CategoricalDistributions.classes
```

```@docs
@@ -1213,10 +1229,6 @@ MLJModelInterface.selectrows
MLJModelInterface.selectcols
```

```@docs
UnivariateFinite
```

```@docs
MLJBase.recursive_getproperty
MLJBase.recursive_setproperty!
