Merge pull request #738 from alan-turing-institute/dev
For a 0.16 release
ablaom authored Feb 8, 2021
2 parents c1f6bb9 + aee3e4f commit 7acb978
Showing 30 changed files with 560 additions and 430 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
      fail-fast: false
      matrix:
        version:
          - '1.0'
          - '1.1'
          - '1' # automatically expands to the latest stable 1.x release of Julia.
        os:
          - ubuntu-latest
@@ -48,6 +48,8 @@ jobs:
  docs:
    name: Documentation
    runs-on: ubuntu-latest
    env:
      JULIA_PKG_SERVER: ""
    steps:
      - uses: actions/checkout@v2
      - uses: julia-actions/setup-julia@v1
8 changes: 4 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJ"
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.15.2"
version = "0.16.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -24,14 +24,14 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CategoricalArrays = "^0.8,^0.9"
ComputationalResources = "^0.3"
Distributions = "^0.21,^0.22,^0.23, 0.24"
MLJBase = "^0.16"
MLJModels = "^0.13"
MLJBase = "^0.17"
MLJModels = "^0.14"
MLJScientificTypes = "^0.4.1"
MLJTuning = "^0.6"
ProgressMeter = "^1.1"
StatsBase = "^0.32,^0.33"
Tables = "^0.2,^1.0"
julia = "1"
julia = "^1.1"

[extras]
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
2 changes: 1 addition & 1 deletion README.md
@@ -117,7 +117,7 @@ appropriate version, activate your MLJ environment and run

```julia
using Pkg;
Pkg.develop(PackageSpec(url="https://github.com/tlienart/OpenSpecFun_jll.jl"))
Pkg.add(PackageSpec(url="https://github.com/tlienart/OpenSpecFun_jll.jl"))
```

#### Serialization for composite models with component models with custom serialization
4 changes: 3 additions & 1 deletion docs/Project.toml
@@ -7,15 +7,17 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728"
MLJScientificTypes = "2e2323e0-db8b-457b-ae0d-bdfb3bc63afd"
MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
1 change: 1 addition & 0 deletions docs/make.jl
@@ -27,6 +27,7 @@ pages = [
"Common MLJ Workflows" => "common_mlj_workflows.md",
"Working with Categorical Data" => "working_with_categorical_data.md",
"Model Search" => "model_search.md",
"Loading Model Code" => "loading_model_code.md",
"Machines" => "machines.md",
"Evaluating Model Performance" => "evaluating_model_performance.md",
"Performance Measures" => "performance_measures.md",
142 changes: 139 additions & 3 deletions docs/src/adding_models_for_general_use.md
@@ -2,7 +2,7 @@

!!! note

Models implementing the MLJ model interface according to the instructions given here should import MLJModelInterface version 0.3.5 or higher. This is enforced with a statement such as `MLJModelInterface = "^0.3.5" ` under `[compat]` in the Project.toml file of the package containing the implementation.
Models implementing the MLJ model interface according to the instructions given here should import MLJModelInterface version 0.3.7 or higher. This is enforced with a statement such as `MLJModelInterface = "^0.3.7" ` under `[compat]` in the Project.toml file of the package containing the implementation.

This guide outlines the specification of the MLJ model interface
and provides detailed guidelines for implementing the interface for
@@ -301,7 +301,8 @@

```julia
MMI.is_pure_julia(::Type{<:SomeSupervisedModel}) = false
MMI.package_license(::Type{<:SomeSupervisedModel}) = "unknown"
```

If `SomeSupervisedModel` supports sample weights, then instead of the `fit` above, one implements
If `SomeSupervisedModel` supports sample weights or class weights,
then instead of the `fit` above, one implements

```julia
MMI.fit(model::SomeSupervisedModel, verbosity, X, y, w=nothing) -> fitresult, cache, report
```

@@ -320,6 +321,19 @@

Additionally, if `SomeSupervisedModel` supports sample weights, one must declare

```julia
MMI.supports_weights(model::Type{<:SomeSupervisedModel}) = true
```
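To make the optional-weights signature concrete, here is a dependency-free sketch (the function name `weighted_mean_fit` is illustrative, not part of any MLJ API): when `w === nothing` the computation falls back to the unweighted case, mirroring the `w=nothing` default in the `fit` signature above.

```julia
# Sketch of the optional sample-weights pattern: `w` defaults to
# `nothing`, signalling that no weights were supplied.
function weighted_mean_fit(y, w=nothing)
    w === nothing && return sum(y) / length(y)  # unweighted fallback
    return sum(w .* y) / sum(w)                 # weighted mean
end

weighted_mean_fit([1.0, 2.0, 3.0])             # → 2.0
weighted_mean_fit([1.0, 2.0, 3.0], [1, 0, 0])  # → 1.0
```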

Optionally, an implementation may add a data front-end, for
transforming user data (such as a table) into some model-specific
format (such as a matrix), and for adding methods to specify how said
format is resampled. (This alters the meaning of `X`, `y` and `w` in
the signatures of `fit`, `update`, `predict`, etc; see [Implementing a
data front-end](@ref) for details). This can provide the MLJ user
certain performance advantages when fitting a machine.

```julia
MLJModelInterface.reformat(model::SomeSupervisedModel, args...) = args
MLJModelInterface.selectrows(model::SomeSupervisedModel, I, data...) = data
```

Optionally, to customize support for serialization of machines (see
[Serialization](@ref)), overload

@@ -409,7 +423,14 @@ MMI.fit(model::SomeSupervisedModel, verbosity, X, y) -> fitresult, cache, report

It is not necessary for `fit` to provide type or dimension checks on
`X` or `y` or to call `clean!` on the model; MLJ will carry out such
checks.

The types of `X` and `y` are constrained by the `input_scitype` and
`target_scitype` trait declarations; see [Trait declarations](@ref)
below. (That is, unless a data front-end is implemented, in which case
these traits refer instead to the arguments of the overloaded
`reformat` method, and the types of `X` and `y` are determined by the
output of `reformat`.)

The method `fit` should never alter hyperparameter values, the sole
exception being fields of type `<:AbstractRNG`. If the package is able
@@ -843,6 +864,120 @@ additional information required (for example, pre-processed versions
of `X` and `y`), as this is also passed as an argument to the `update`
method.

### Implementing a data front-end

!!! note

It is suggested that packages implementing MLJ's model API that later add a data front-end tag their changes in a breaking release. (The changes will not break the use of models for the ordinary MLJ user, who interacts with models exclusively through the machine interface. However, they will break usage for some external packages that have chosen to depend directly on the model API.)

```julia
MLJModelInterface.reformat(model, args...) -> data
MLJModelInterface.selectrows(::Model, I, data...) -> sampled_data
```

Models optionally overload `reformat` to define transformations of
user-supplied data into some model-specific representation (e.g., from
a table to a matrix). Computational overheads associated with multiple
`fit!`/`predict`/`transform` calls (on MLJ machines) are then avoided,
when memory resources allow. The fallback returns `args` (no
transformation).

The `selectrows(::Model, I, data...)` method is overloaded to specify
how the model-specific data is to be subsampled, for some observation
indices `I` (a colon, `:`, or instance of
`AbstractVector{<:Integer}`). In this way, implementing a data
front-end also allows more efficient resampling of data (in user calls
to `evaluate!`).

After detailing formal requirements for implementing a data front-end,
we give a [Sample implementation](@ref). A simple
[implementation](https://github.com/Evovest/EvoTrees.jl/blob/94b58faf3042009bd609c9a5155a2e95486c2f0e/src/MLJ.jl#L23)
also appears in the EvoTrees.jl package.

Here "user-supplied data" is what the MLJ user supplies when
constructing a machine, as in `machine(model, args...)`, which
coincides with the arguments expected by `fit(model, verbosity,
args...)` when `reformat` is not overloaded.

Implementing a `reformat` data front-end is permitted for any `Model`
subtype, except for subtypes of `Static`. Here is a complete list of
responsibilities for such an implementation, for some
`model::SomeModelType` (a sample implementation follows):

- A `reformat(model::SomeModelType, args...) -> data` method must be
implemented for each form of `args...` appearing in a valid machine
construction `machine(model, args...)` (there will be one for each
possible signature of `fit(::SomeModelType, ...)`).

- Additionally, if not included above, there must be a single-argument
form of `reformat`, `reformat(model::SomeModelType, arg) -> (data,)`,
serving as a data front-end for operations like `predict`. It must
always hold that `reformat(model, args...)[1] ==
reformat(model, args[1])[1]`.

*Important.* `reformat(model::SomeModelType, args...)` must always
return a tuple of the same length as `args`, even if this is one.

- `fit(model::SomeModelType, verbosity, data...)` should be
implemented as if `data` is the output of `reformat(model,
args...)`, where `args` is the data an MLJ user has bound to `model`
in some machine. The same applies to any overloading of `update`.

- Each implemented operation, such as `predict` and `transform` - but
excluding `inverse_transform` - must be defined as if its data
arguments are `reformat`ed versions of user-supplied data. For
example, in the supervised case, `data_new` in
`predict(model::SomeModelType, fitresult, data_new)` is
`reformat(model, Xnew)`, where `Xnew` is the data provided by the MLJ
user in a call `predict(mach, Xnew)` (`mach.model == model`).

- To specify how the model-specific representation of data is to be
resampled, implement `selectrows(model::SomeModelType, I, data...)
-> resampled_data` for each overloading of
`reformat(model::SomeModelType, args...) -> data` above. Here `I` is
an arbitrary abstract integer vector or `:` (type `Colon`).

*Important.* `selectrows(model::SomeModelType, I, args...)` must always
return a tuple of the same length as `args`, even if this is one.

The fallback for `selectrows` is described at [`selectrows`](@ref).
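The tuple-length and consistency requirements above can be illustrated without any MLJ dependency. In this sketch, `my_reformat` and `my_selectrows` are hypothetical stand-ins for the real `reformat` and `selectrows` methods:

```julia
# Both methods return a tuple with one entry per data argument, even in
# the single-argument case, and the single-argument `my_reformat` agrees
# with the first entry of the multi-argument one.
my_reformat(X) = (float.(X),)                  # e.g. eager element conversion
my_reformat(X, y) = (my_reformat(X)..., y)
my_selectrows(I, X) = (view(X, I),)
my_selectrows(I, X, y) = (view(X, I), view(y, I))

data = my_reformat([1, 2, 3], ["a", "b", "c"])
@assert length(data) == 2                        # same length as the args
@assert data[1] == my_reformat([1, 2, 3])[1]     # consistency requirement
@assert length(my_selectrows(1:2, data...)) == 2 # tuple out as well
```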


#### Sample implementation

Suppose a supervised model type `SomeSupervised` supports sample
weights, leading to two different `fit` signatures, and that it has a
single operation `predict`:

    fit(model::SomeSupervised, verbosity, X, y)
    fit(model::SomeSupervised, verbosity, X, y, w)

    predict(model::SomeSupervised, fitresult, Xnew)

Without a data front-end implemented, suppose `X` is expected to be a
table and `y` a vector, but suppose the core algorithm always converts
`X` to a matrix with features as rows (features corresponding to
columns in the table). Then a new data front-end might look like
this:

    const MMI = MLJModelInterface

    # for fit:
    MMI.reformat(::SomeSupervised, X, y) = (MMI.matrix(X, transpose=true), y)
    MMI.reformat(::SomeSupervised, X, y, w) = (MMI.matrix(X, transpose=true), y, w)
    MMI.selectrows(::SomeSupervised, I, Xmatrix, y) =
        (view(Xmatrix, :, I), view(y, I))
    MMI.selectrows(::SomeSupervised, I, Xmatrix, y, w) =
        (view(Xmatrix, :, I), view(y, I), view(w, I))

    # for predict:
    MMI.reformat(::SomeSupervised, X) = (MMI.matrix(X, transpose=true),)
    MMI.selectrows(::SomeSupervised, I, Xmatrix) = (view(Xmatrix, :, I),)

With these additions, `fit` and `predict` are refactored, so that `X`
and `Xnew` represent matrices with features as rows.
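The transposed-matrix convention in the sample above can be checked with plain Julia (no MLJ dependency; `table_to_matrix` and `subsample` are illustrative helpers standing in for `reformat` and `selectrows`):

```julia
# A "table" (here a NamedTuple of columns) becomes a matrix with
# features as rows, so subsampling observations is a view of *columns*.
table_to_matrix(X) = permutedims(hcat(values(X)...))  # features as rows
subsample(Xmatrix, I) = view(Xmatrix, :, I)           # observations `I`

X = (x1 = [1.0, 2.0, 3.0], x2 = [10.0, 20.0, 30.0])
Xmatrix = table_to_matrix(X)   # 2×3: one row per feature
sub = subsample(Xmatrix, 1:2)  # first two observations only
```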


### Supervised models with a `transform` method

A supervised model may optionally implement a `transform` method,
@@ -1183,3 +1318,4 @@ add a model, you need to follow these steps

- An administrator will then review your implementation and work with
you to add the model to the registry