diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..f162e37c7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*ipynb linguist-vendored diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md new file mode 100644 index 000000000..4e5836616 --- /dev/null +++ b/BIBLIOGRAPHY.md @@ -0,0 +1,36 @@ +# Citing MLJ + +An overview of MLJ design: + + +[![DOI](https://joss.theoj.org/papers/10.21105/joss.02704/status.svg)](https://doi.org/10.21105/joss.02704) + +```bibtex +@article{Blaom2020, + doi = {10.21105/joss.02704}, + url = {https://doi.org/10.21105/joss.02704}, + year = {2020}, + publisher = {The Open Journal}, + volume = {5}, + number = {55}, + pages = {2704}, + author = {Anthony D. Blaom and Franz Kiraly and Thibaut Lienart and Yiannis Simillides and Diego Arenas and Sebastian J. Vollmer}, + title = {{MLJ}: A Julia package for composable machine learning}, + journal = {Journal of Open Source Software} +} +``` + +An in-depth view of MLJ's model composition design: + +[![arXiv](https://img.shields.io/badge/arXiv-2012.15505-.svg)](https://arxiv.org/abs/2012.15505) + +```bibtex +@misc{blaom2020flexible, + title={Flexible model composition in machine learning and its implementation in {MLJ}}, + author={Anthony D. Blaom and Sebastian J. Vollmer}, + year={2020}, + eprint={2012.15505}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} +``` diff --git a/ORGANIZATION.md b/ORGANIZATION.md index c72794980..ffb5636ff 100644 --- a/ORGANIZATION.md +++ b/ORGANIZATION.md @@ -13,18 +13,20 @@ its conventional use, are marked with a ⟂ symbol: evaluating and tuning machine learning models. It pulls in most code from other repositories described below. MLJ also hosts the [MLJ manual](src/docs) which documents functionality across the - repositories, with the exception of ScientificTypes, and + repositories, with the exception of ScientificTypesBase, and MLJScientific types which host their own documentation. 
(The MLJ manual and MLJTutorials do provide overviews of scientific types.) -* [MLJModelInterface.jl](https://github.com/alan-turing-institute/MLJModelInterface.jl) - is a lightweight package imported by packages implementing - MLJ's interface for their machine learning models. It's *sole* - dependency is ScientificTypes, which is a tiny package with *no* - dependencies. +* [MLJModelInterface.jl](https://github.com/JuliaAI/MLJModelInterface.jl) + is a lightweight package imported by packages implementing MLJ's + interface for their machine learning models. Its only dependencies + are ScientificTypesBase.jl (which depends only on the standard + library module `Random`) and + [StatisticalTraits.jl](https://github.com/JuliaAI/StatisticalTraits.jl) + (which depends only on ScientificTypesBase.jl). * (⟂) - [MLJBase.jl](https://github.com/alan-turing-institute/MLJBase.jl) is + [MLJBase.jl](https://github.com/JuliaAI/MLJBase.jl) is a large repository with two main purposes: (i) to give "dummy" methods defined in MLJModelInterface their intended functionality (which depends on third party packages, such as @@ -35,17 +37,17 @@ its conventional use, are marked with a ⟂ symbol: and (ii) provide functionality essential to the MLJ user that has not been relegated to its own "satellite" repository for some reason. See the [MLJBase.jl - readme](https://github.com/alan-turing-institute/MLJBase.jl) for a + readme](https://github.com/JuliaAI/MLJBase.jl) for a detailed description of MLJBase's contents. -* [MLJModels.jl](https://github.com/alan-turing-institute/MLJModels.jl) - hosts the MLJ **registry**, which contains metadata on all the +* [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) + hosts the *MLJ model registry*, which contains metadata on all the models the MLJ user can search and load from MLJ. Moreover, it provides the functionality for **loading model code** from MLJ on demand. 
Finally, it furnishes some commonly used transformers for data pre-processing, such as `ContinuousEncoder` and `Standardizer`. -* [MLJTuning.jl](https://github.com/alan-turing-institute/MLJTuning.jl) +* [MLJTuning.jl](https://github.com/JuliaAI/MLJTuning.jl) provides MLJ's `TunedModel` wrapper for hyper-parameter optimization, including the extendable API for tuning strategies, and selected in-house implementations, such as `Grid` and @@ -67,17 +69,18 @@ its conventional use, are marked with a ⟂ symbol: exchange platform * (⟂) - [MLJLinearModels.jl](https://github.com/alan-turing-institute/MLJLinearModels.jl) + [MLJLinearModels.jl](https://github.com/JuliaAI/MLJLinearModels.jl) is an experimental package for a wide range of julia-native penalized linear models such as Lasso, Elastic-Net, Robust regression, LAD regression, etc. -* [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl) an - experimental package for using **neural-network models**, built with +* [MLJFlux.jl](https://github.com/JuliaAI/MLJFlux.jl) an experimental + package for gradient-descent models, such as traditional + neural-networks, built with [Flux.jl](https://github.com/FluxML/Flux.jl), in MLJ. * (⟂) - [ScientificTypes.jl](https://github.com/alan-turing-institute/ScientificTypes.jl) + [ScientificTypesBase.jl](https://github.com/JuliaAI/ScientificTypesBase.jl) is an ultra lightweight package providing "scientific" types, such as `Continuous`, `OrderedFactor`, `Image` and `Table`. It's purpose is to formalize conventions around the scientific @@ -85,12 +88,12 @@ its conventional use, are marked with a ⟂ symbol: `DataFrame`. * (⟂) - [MLJScientificTypes.jl](https://github.com/alan-turing-institute/MLJScientificTypes.jl) + [ScientificTypes.jl](https://github.com/JuliaAI/ScientificTypes.jl) articulates MLJ's own convention for the scientific interpretation of data. 
* (⟂) - [StatisticalTraits.jl](https://github.com/alan-turing-institute/StatisticalTraits.jl) + [StatisticalTraits.jl](https://github.com/JuliaAI/StatisticalTraits.jl) An ultra lightweight package defining fall-back implementations for a collection of traits possessed by statistical objects. diff --git a/Project.toml b/Project.toml index ac10761c0..2049ba95d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJ" uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" authors = ["Anthony D. Blaom "] -version = "0.16.6" +version = "0.16.7" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -14,12 +14,12 @@ MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0" MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55" MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" MLJOpenML = "cbea4545-8c96-4583-ad3a-44078d60d369" -MLJScientificTypes = "2e2323e0-db8b-457b-ae0d-bdfb3bc63afd" MLJSerialization = "17bed46d-0ab5-4cd4-b792-a5c4b8547c6d" MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -33,10 +33,10 @@ MLJEnsembles = "0.1" MLJIteration = "0.3" MLJModels = "0.14" MLJOpenML = "1" -MLJScientificTypes = "0.4.1" MLJSerialization = "1.1" MLJTuning = "0.6" ProgressMeter = "1.1" +ScientificTypes = "2" StatsBase = "0.32,0.33" Tables = "0.2,1.0" julia = "1.3" diff --git a/README.md b/README.md index 384317ee0..584fc0938 100644 --- a/README.md +++ b/README.md @@ -12,87 +12,56 @@ Documentation - - - - - Binder - - DOI + + bibtex + + + bibtex

-**New to MLJ? Start [here](https://alan-turing-institute.github.io/MLJ.jl/dev/)**. - -**Wanting to integrate an existing machine learning model into the MLJ -framework? Start -[here](https://alan-turing-institute.github.io/MLJ.jl/dev/quick_start_guide_to_adding_models/)**. - -The remaining information on this page will be of interest primarily -to developers interested in contributing to core packages in the MLJ -ecosystem, whose organization is described further below. - MLJ (Machine Learning in Julia) is a toolbox written in Julia providing a common interface and meta-algorithms for selecting, -tuning, evaluating, composing and comparing over [150 machine -learning +tuning, evaluating, composing and comparing over [160 machine learning models](https://alan-turing-institute.github.io/MLJ.jl/dev/list_of_supported_models/) -written in Julia and other languages. MLJ is released under the MIT -license and sponsored by the [Alan Turing -Institute](https://www.turing.ac.uk/). +written in Julia and other languages. -
-

-MLJ Universe  •  -Known Issues  •  -Customizing Behavior  •  -Citing MLJ -

-
+ +**New to MLJ?** Start [here](https://alan-turing-institute.github.io/MLJ.jl/dev/). + +**Integrating an existing machine learning model into the MLJ +framework?** Start [here](https://alan-turing-institute.github.io/MLJ.jl/dev/quick_start_guide_to_adding_models/). + +MLJ was initially created as a Tools, +Practices and Systems project at the [Alan Turing +Institute](https://www.turing.ac.uk/) in 2019. Current funding is +provided by a [New Zealand Strategic Science Investment +Fund](https://www.mbie.govt.nz/science-and-technology/science-and-innovation/funding-information-and-opportunities/investment-funds/strategic-science-investment-fund/ssif-funded-programmes/university-of-auckland/). + +MLJ has been developed with the support of the following organizations: + +
+ + + + + +
### The MLJ Universe The functionality of MLJ is distributed over a number of repositories -illustrated in the dependency chart below. - -
-

-Contributing  •  -Code Organization  •  -Road Map -
-
- MLJ  •  - MLJBase  •  - MLJModelInterface  •  - MLJModels  •  - MLJTuning  •  - MLJLinearModels  •  - MLJFlux -
-
- MLJTutorials  •  - MLJEnsembles  •  - MLJIteration  •  - MLJOpenML  •  - MLJSerialization -
-
- MLJScientificTypes  •  - ScientificTypes -

-

-
-

+illustrated in the dependency chart below. These repositories live at +the [JuliaAI](https://github.com/JuliaAI) umbrella organization.
Dependency Chart @@ -101,89 +70,20 @@ illustrated in the dependency chart below. *Dependency chart for MLJ repositories. Repositories with dashed connections do not currently exist but are planned/proposed.* - -### Known Issues - -#### ScikitLearn/MKL issue - -For users of Mac OS using Julia 1.3 or higher, using ScikitLearn -models can lead to unexpected MKL errors due to an issue not related -to MLJ. See -[this Julia Discourse discussion](https://discourse.julialang.org/t/julia-1-3-1-4-on-macos-and-intel-mkl-error/36469/2) -and -[this issue](https://github.com/JuliaPackaging/BinaryBuilder.jl/issues/700) -for context. - -A temporary workaround for this issue is to force the installation of -an older version of the `OpenSpecFun_jll` library. To install an -appropriate version, activate your MLJ environment and run - -```julia - using Pkg; - Pkg.add(PackageSpec(url="https://github.com/tlienart/OpenSpecFun_jll.jl")) -``` - -#### Serialization for composite models with component models with custom serialization - -See -[here](https://github.com/alan-turing-institute/MLJ.jl/issues/678). Workaround: -Instead of `XGBoost` models (the chief known case) use models from the -pure Julia package `EvoTrees`. - - -### Customizing behavior - -To customize behaviour of MLJ you will need to clone the relevant -component package (e.g., MLJBase.jl) - or a fork thereof - and modify -your local julia environment to use your local clone in place of the -official release. For example, you might proceed something like this: - -```julia -using Pkg -Pkg.activate("my_MLJ_enf", shared=true) -Pkg.develop("path/to/my/local/MLJBase") -``` - -To test your local clone, do - -```julia -Pkg.test("MLJBase") -``` - -For more on package management, see https://julialang.github.io/Pkg.jl/v1/ . 
- - - -### Citing MLJ - - -[![DOI](https://joss.theoj.org/papers/10.21105/joss.02704/status.svg)](https://doi.org/10.21105/joss.02704) - -```bibtex -@article{Blaom2020, - doi = {10.21105/joss.02704}, - url = {https://doi.org/10.21105/joss.02704}, - year = {2020}, - publisher = {The Open Journal}, - volume = {5}, - number = {55}, - pages = {2704}, - author = {Anthony D. Blaom and Franz Kiraly and Thibaut Lienart and Yiannis Simillides and Diego Arenas and Sebastian J. Vollmer}, - title = {{MLJ}: A Julia package for composable machine learning}, - journal = {Journal of Open Source Software} -} -``` +
+

+Contributing  •  +Code Organization  •  +Road Map +
#### Contributors *Core design*: A. Blaom, F. Kiraly, S. Vollmer -*Active maintainers*: A. Blaom, T. Lienart, S. Okon +*Lead contributor*: A. Blaom -*Active collaborators*: D. Arenas, D. Buchaca, J. Hoffimann, S. Okon, J. Samaroo, S. Vollmer +*Active maintainers*: A. Blaom, S. Okon, T. Lienart, D. Aluthge -*Past collaborators*: D. Aluthge, E. Barp, G. Bohner, M. K. Borregaard, V. Churavy, H. Devereux, M. Giordano, M. Innes, F. Kiraly, M. Nook, Z. Nugent, P. Oleśkiewicz, A. Shridar, Y. Simillides, A. Sengupta, A. Stechemesser. -#### License -MLJ is supported by the Alan Turing Institute and released under the MIT "Expat" License. diff --git a/ROADMAP.md b/ROADMAP.md index 3a93c384d..aeea17db5 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -42,13 +42,13 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). ### Adding models -- [ ] **Integrate deep learning** using [Flux.jl](https://github.com/FluxML/Flux.jl.git) deep learning. [Done](https://github.com/alan-turing-institute/MLJFlux.jl) but can +- [ ] **Integrate deep learning** using [Flux.jl](https://github.com/FluxML/Flux.jl.git) deep learning. [Done](https://github.com/JuliaAI/MLJFlux.jl) but can improve experience by: - [x] finishing iterative model wrapper [#139](https://github.com/alan-turing-institute/MLJ.jl/issues/139) - [ ] improving performance by implementing data front-end after (see [MLJBase - #501](https://github.com/alan-turing-institute/MLJBase.jl/pull/501)) but see also [this relevant discussion](https://github.com/FluxML/MLJFlux.jl/issues/97). + #501](https://github.com/JuliaAI/MLJBase.jl/pull/501)) but see also [this relevant discussion](https://github.com/FluxML/MLJFlux.jl/issues/97). - [ ] Probabilistic programming: @@ -66,7 +66,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [ ] Feature engineering (python featuretools?, recursive feature elimination?) 
- [#426](https://github.com/alan-turing-institute/MLJ.jl/issues/426) [MLJModels #314](https://github.com/alan-turing-institute/MLJModels.jl/issues/314) + [#426](https://github.com/alan-turing-institute/MLJ.jl/issues/426) [MLJModels #314](https://github.com/JuliaAI/MLJModels.jl/issues/314) ### Enhancing core functionality @@ -74,7 +74,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [x] Iterative model control [#139](https://github.com/alan-turing-institute/MLJ.jl/issues/139). [Done](https://github.com/JuliaAI/MLJIteration.jl) - [ ] **†** Add more tuning - strategies. See [here](https://github.com/alan-turing-institute/MLJTuning.jl#what-is-provided-here) + strategies. See [here](https://github.com/JuliaAI/MLJTuning.jl#what-is-provided-here) for complete wish-list. Particular focus on: @@ -83,7 +83,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). (done) - [x] Latin hypercube - [done](https://github.com/alan-turing-institute/MLJTuning.jl/pull/96) + [done](https://github.com/JuliaAI/MLJTuning.jl/pull/96) - [ ] Bayesian methods, starting with Gaussian Process methods a la PyMC3. Some preliminary research done . @@ -102,7 +102,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [ ] Particle Swarm Optization (current WIP, GSoC project @lhnguyen-vn) - [ ] tuning strategies for non-Cartesian spaces of models [MLJTuning - #18](https://github.com/alan-turing-institute/MLJTuning.jl/issues/18), architecture search, and other AutoML workflows + #18](https://github.com/JuliaAI/MLJTuning.jl/issues/18), architecture search, and other AutoML workflows - [ ] Systematic benchmarking, probably modeled on [MLaut](https://arxiv.org/abs/1901.03678) [#69](https://github.com/alan-turing-institute/MLJ.jl/issues/74) @@ -113,7 +113,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). 
- [ ] **†** Enhance complex model compostition: - - [ ] Introduce a canned + - [x] Introduce a canned stacking model wrapper ([POC](https://alan-turing-institute.github.io/DataScienceTutorials.jl/getting-started/stacking/)). WIP @olivierlabayle - [ ] Get rid of macros for creating pipelines and possibly @@ -139,9 +139,9 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [ ] Spin-off a stand-alone measures (loss functions) package (currently - [here](https://github.com/alan-turing-institute/MLJBase.jl/tree/master/src/measures)). Introduce + [here](https://github.com/JuliaAI/MLJBase.jl/tree/master/src/measures)). Introduce measures for multi-targets [MLJBase - #502](https://github.com/alan-turing-institute/MLJBase.jl/issues/502). + #502](https://github.com/JuliaAI/MLJBase.jl/issues/502). - [ ] Add sparse data support and better support for NLP models; we could use [NaiveBayes.jl](https://github.com/dfdx/NaiveBayes.jl) @@ -152,12 +152,12 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [x] POC for implementation of time series models classification [#303](https://github.com/alan-turing-institute/MLJ.jl/issues/303), - [ScientificTypes #14](https://github.com/alan-turing-institute/ScientificTypes.jl/issues/14) POC is [here](https://github.com/alan-turing-institute/TimeSeriesClassification.jl) + [ScientificTypesBase #14](https://github.com/JuliaAI/ScientificTypesBase.jl/issues/14) POC is [here](https://github.com/JuliaAI/TimeSeriesClassification.jl) - [ ] POC for time series forecasting, along lines of sktime; probably needs [MLJBase - #502](https://github.com/alan-turing-institute/MLJBase.jl/issues/502) + #502](https://github.com/JuliaAI/MLJBase.jl/issues/502) first, and someone to finish [PR on time series - CV](https://github.com/alan-turing-institute/MLJBase.jl/pull/331). 
See also [this proposal](https://julialang.org/jsoc/gsoc/MLJ/#time_series_forecasting_at_scale_-_speed_up_via_julia) + CV](https://github.com/JuliaAI/MLJBase.jl/pull/331). See also [this proposal](https://julialang.org/jsoc/gsoc/MLJ/#time_series_forecasting_at_scale_-_speed_up_via_julia) - [ ] Add tools or separate repository for visualization in MLJ. @@ -165,7 +165,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). [#85](https://github.com/alan-turing-institute/MLJ.jl/issues/85) (closed). [#416](https://github.com/alan-turing-institute/MLJ.jl/issues/416) - [Done](https://github.com/alan-turing-institute/MLJTuning.jl/pull/121) but might be worth adding alternatives suggested in issue. + [Done](https://github.com/JuliaAI/MLJTuning.jl/pull/121) but might be worth adding alternatives suggested in issue. - [ ] visualizing decision boundaries ? [#342](https://github.com/alan-turing-institute/MLJ.jl/issues/342) @@ -178,7 +178,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). - [x] missing value imputation using Gaussina Mixture Model. Done, via addition of BetaML model, `MissingImputator`. - - [ ] improve `autotype` method (from MLJScientificTypes), perhaps by + - [ ] improve `autotype` method (from ScientificTypes), perhaps by training on large collection of datasets with manually labelled scitype schema. @@ -190,7 +190,7 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). ### Scalability - [ ] Roll out data front-ends for all models after [MLJBase - #501](https://github.com/alan-turing-institute/MLJBase.jl/pull/501) + #501](https://github.com/JuliaAI/MLJBase.jl/pull/501) is merged. - [ ] Online learning support and distributed data @@ -204,5 +204,5 @@ list](https://github.com/alan-turing-institute/MLJ.jl/issues/673). 
[#71](https://github.com/alan-turing-institute/MLJ.jl/issues/71) - [x] Add multithreading to tuning [MLJTuning - #15](https://github.com/alan-turing-institute/MLJTuning.jl/issues/15) - [Done](https://github.com/alan-turing-institute/MLJTuning.jl/pull/42). + #15](https://github.com/JuliaAI/MLJTuning.jl/issues/15) + [Done](https://github.com/JuliaAI/MLJTuning.jl/pull/42). diff --git a/docs/Project.toml b/docs/Project.toml index 7c9152bb8..2a15d5ea6 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -20,7 +20,6 @@ MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728" MLJOpenML = "cbea4545-8c96-4583-ad3a-44078d60d369" -MLJScientificTypes = "2e2323e0-db8b-457b-ae0d-bdfb3bc63afd" MLJSerialization = "17bed46d-0ab5-4cd4-b792-a5c4b8547c6d" MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" @@ -28,6 +27,7 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" +ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" @@ -37,7 +37,7 @@ MLJBase = "0.18" MLJEnsembles = "0.1" MLJIteration = "0.3" MLJModels = "0.14.4" -MLJScientificTypes = "0.4.6" MLJTuning = "0.6.5" -ScientificTypes = "1.1.1" +ScientificTypes = "2" +ScientificTypesBase = "1" julia = "1" diff --git a/docs/make.jl b/docs/make.jl index a24b8247e..13c25477a 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,9 +14,9 @@ import MLJTuning import MLJModels import MLJEnsembles import MLJOpenML -import MLJScientificTypes -import MLJModelInterface import ScientificTypes +import MLJModelInterface +import ScientificTypesBase import Distributions using CategoricalArrays # 
avoid types like CategoricalArrays.Categorica using LossFunctions @@ -51,12 +51,13 @@ pages = [ "Quick-Start Guide to Adding Models" => "quick_start_guide_to_adding_models.md", "Adding Models for General Use" => "adding_models_for_general_use.md", - "Benchmarking" => "benchmarking.md", + "Customizing Behavior" => "customizing_behavior.md", "Internals" => "internals.md", "List of Supported Models" => "list_of_supported_models.md", "Third Party Packages" => "third_party_packages.md", "Glossary" => "glossary.md", "MLJ Cheatsheet" => "mlj_cheatsheet.md", + "Known Issues" => "known_issues.md", "FAQ" => "frequently_asked_questions.md", "Julia BlogPost" => "julia_blogpost.md", "Index of Methods" => "api.md", @@ -74,9 +75,9 @@ makedocs( MLJTuning, MLJModels, MLJEnsembles, - MLJScientificTypes, - MLJModelInterface, ScientificTypes, + MLJModelInterface, + ScientificTypesBase, MLJIteration, MLJSerialization, MLJOpenML, diff --git a/docs/src/adding_models_for_general_use.md b/docs/src/adding_models_for_general_use.md index a46011e2a..f528a5bb5 100755 --- a/docs/src/adding_models_for_general_use.md +++ b/docs/src/adding_models_for_general_use.md @@ -10,11 +10,11 @@ models intended for general use. See also the more condensed [Quick-Start Guide to Adding Models](@ref). For sample implementations, see -[MLJModels/src](https://github.com/alan-turing-institute/MLJModels.jl/tree/master/src). +[MLJModels/src](https://github.com/JuliaAI/MLJModels.jl/tree/master/src). The machine learning tools provided by MLJ can be applied to the models in any package that imports the package -[MLJModelInterface](https://github.com/alan-turing-institute/MLJModelInterface.jl) and +[MLJModelInterface](https://github.com/JuliaAI/MLJModelInterface.jl) and implements the API defined there, as outlined below. For a quick-and-dirty implementation of user-defined models see [Simple User Defined Models](simple_user_defined_models.md). To make new models @@ -24,14 +24,14 @@ models](@ref). 
#### Important -[MLJModelInterface](https://github.com/alan-turing-institute/MLJModelInterface.jl) +[MLJModelInterface](https://github.com/JuliaAI/MLJModelInterface.jl) is a very light-weight interface allowing you to *define* your interface, but does not provide the functionality required to use or test your interface; this requires -[MLJBase](https://github.com/alan-turing-institute/MLJBase.jl). So, +[MLJBase](https://github.com/JuliaAI/MLJBase.jl). So, while you only need to add `MLJModelInterface` to your project's [deps], for testing purposes you need to add -[MLJBase](https://github.com/alan-turing-institute/MLJBase.jl) to your +[MLJBase](https://github.com/JuliaAI/MLJBase.jl) to your project's [extras] and [targets]. In testing, simply use `MLJBase` in place of `MLJModelInterface`. @@ -39,7 +39,7 @@ It is assumed the reader has read [Getting Started](index.md). To implement the API described here, some familiarity with the following packages is also helpful: -- [MLJScientificTypes.jl](https://github.com/alan-turing-institute/MLJScientificTypes.jl) +- [ScientificTypes.jl](https://github.com/JuliaAI/ScientificTypes.jl) (for specifying model requirements of data) - [Distributions.jl](https://github.com/JuliaStats/Distributions.jl) @@ -185,7 +185,7 @@ parameters. #### Known issue with @mlj_macro Defaults with negative values can trip up the `@mlj_macro` (see [this -issue](https://github.com/alan-turing-institute/MLJBase.jl/issues/68)). So, +issue](https://github.com/JuliaAI/MLJBase.jl/issues/68)). So, for example, this does not work: ```julia @@ -549,7 +549,7 @@ end ``` For a concrete example, refer to the -[code](https://github.com/alan-turing-institute/MLJModels.jl/blob/master/src/ScikitLearn.jl) +[code](https://github.com/JuliaAI/MLJModels.jl/blob/master/src/ScikitLearn.jl) for `SVMClassifier`. 
Of course, if you are coding a learning algorithm from scratch, rather @@ -624,7 +624,7 @@ The constructor has a lot of options, including passing a dictionary instead of vectors. See [`UnivariateFinite`](@ref) for details. See -[LinearBinaryClassifier](https://github.com/alan-turing-institute/MLJModels.jl/blob/master/src/GLM.jl) +[LinearBinaryClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/master/src/GLM.jl) for an example of a Probabilistic classifier implementation. *Important note on binary classifiers.* There is no "Binary" scitype @@ -634,7 +634,7 @@ an alias for `Union{Multiclass{2},OrderedFactor{2}}`. The `AbstractVector{<:Binary}` and according to the *mlj* scitype convention, elements of `y` have type `CategoricalValue`, and *not* `Bool`. See -[BinaryClassifier](https://github.com/alan-turing-institute/MLJModels.jl/blob/master/src/GLM.jl) +[BinaryClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/master/src/GLM.jl) for an example. @@ -672,7 +672,7 @@ attempt to use your model with inappropriately typed data. The trait functions `input_scitype` and `target_scitype` take scientific data types as values. We assume here familiarity with -[MLJScientificTypes.jl](https://github.com/alan-turing-institute/MLJScientificTypes.jl) +[ScientificTypes.jl](https://github.com/JuliaAI/ScientificTypes.jl) (see [Getting Started](index.md) for the basics). For example, to ensure that the `X` presented to the @@ -755,7 +755,7 @@ method | return type | declarable return values Here is the complete list of trait function declarations for `DecisionTreeClassifier`, whose core algorithms are provided by DecisionTree.jl, but whose interface actually lives at -[MLJDecisionTreeInterface.jl](https://github.com/alan-turing-institute/MLJDecisionTreeInterface.jl). +[MLJDecisionTreeInterface.jl](https://github.com/JuliaAI/MLJDecisionTreeInterface.jl). 
```julia MMI.input_scitype(::Type{<:DecisionTreeClassifier}) = MMI.Table(MMI.Continuous) @@ -991,7 +991,7 @@ A working implementation of a model that fits a `UnivariateFinite` distribution to some categorical data using [Laplace smoothing](https://en.wikipedia.org/wiki/Additive_smoothing) controlled by a hyper-parameter `alpha` is given -[here](https://github.com/alan-turing-institute/MLJBase.jl/blob/d377bee1198ec179a4ade191c11fef583854af4a/test/interface/model_api.jl#L36). +[here](https://github.com/JuliaAI/MLJBase.jl/blob/d377bee1198ec179a4ade191c11fef583854af4a/test/interface/model_api.jl#L36). ### Serialization @@ -1193,26 +1193,26 @@ to all MLJ users: 2. **Separate interface package**. Implementation code lives in a separate *interface package*, which has the algorithm providing package as a dependency. An example is - [MLJDecisionTreeInterface.jl](https://github.com/alan-turing-institute/MLJDecisionTreeInterface.jl), + [MLJDecisionTreeInterface.jl](https://github.com/JuliaAI/MLJDecisionTreeInterface.jl), which provides the interface for models in [DecisionTree.jl](https://github.com/bensadeghi/DecisionTree.jl). Additionally, one needs to ensure that the implementation code defines the `package_name` and `load_path` model traits appropriately, so that `MLJ`'s `@load` macro can find the necessary code (see -[MLJModels/src](https://github.com/alan-turing-institute/MLJModels.jl/tree/master/src) +[MLJModels/src](https://github.com/JuliaAI/MLJModels.jl/tree/master/src) for examples). ### How to add models to the MLJ model registry? The MLJ model registry is located in the [MLJModels.jl -repository](https://github.com/alan-turing-institute/MLJModels.jl). To +repository](https://github.com/JuliaAI/MLJModels.jl). 
To add a model, you need to follow these steps - Ensure your model conforms to the interface defined above - Raise an issue at - [MLJModels.jl](https://github.com/alan-turing-institute/MLJModels.jl/issues) + [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl/issues) and point out where the MLJ-interface implementation is, e.g. by providing a link to the code. diff --git a/docs/src/composing_models.md b/docs/src/composing_models.md index b94206fcd..f684a6ab6 100644 --- a/docs/src/composing_models.md +++ b/docs/src/composing_models.md @@ -21,7 +21,7 @@ composition in machine learning and its implementation in MLJ. Preprint, arXiv:2012.15505](https://arxiv.org/abs/2012.15505). -## Linear pipelines +## Linear Pipelines In MLJ a *pipeline* is a composite model in which models are chained together in a linear (non-branching) chain. Pipelines can include diff --git a/docs/src/customizing_behavior.md b/docs/src/customizing_behavior.md new file mode 100644 index 000000000..a9bda58f1 --- /dev/null +++ b/docs/src/customizing_behavior.md @@ -0,0 +1,21 @@ +# Customizing Behavior + +To customize behaviour of MLJ you will need to clone the relevant +component package (e.g., MLJBase.jl) - or a fork thereof - and modify +your local julia environment to use your local clone in place of the +official release. For example, you might proceed something like this: + +```julia +using Pkg +Pkg.activate("my_MLJ_enf", shared=true) +Pkg.develop("path/to/my/local/MLJBase") +``` + +To test your local clone, do + +```julia +Pkg.test("MLJBase") +``` + +For more on package management, see [here](https://julialang.github.io/Pkg.jl/v1/). + diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 65010b335..e208a407a 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -46,7 +46,7 @@ models(matching(X,y)) In MLJ a *model* is a struct storing the hyperparameters of the learning algorithm indicated by the struct name (and nothing else). 
For common problems matching data to models, see [Model -Search](@ref) and [Preparing Data](@ref). +Search](@ref model_search) and [Preparing Data](@ref). Assuming the MLJDecisionTreeInterface.jl package is in your load path (see [Installation](@ref)) we can use `@load` to import the @@ -259,12 +259,12 @@ use of MLJ.* The MLJ user should acquaint themselves with some basic assumptions about the form of data expected by MLJ, as outlined below. The basic -`machine` constructions look like this (see also [Constructing +`machine` constructors look like this (see also [Constructing machines](@ref)): ``` -machine(model::Supervised, X, y) machine(model::Unsupervised, X) +machine(model::Supervised, X, y) ``` Each supervised model in MLJ declares the permitted *scientific type* @@ -274,9 +274,9 @@ as `Array{Float32, 2}`). Similar remarks apply to the input `X` of an unsupervised model. Scientific types are julia types defined in the package -[ScientificTypes.jl](https://github.com/alan-turing-institute/ScientificTypes.jl); +[ScientificTypesBase.jl](https://github.com/JuliaAI/ScientificTypesBase.jl); the package -[MLJScientificTypes.jl](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/) +[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/) implements the particular convention used in the MLJ universe for assigning a specific scientific type (interpretation) to each julia object (see the `scitype` examples below). @@ -286,7 +286,7 @@ The basic "scalar" scientific types are `Continuous`, `Multiclass{N}`, scientific types](@ref) below to guarantee your scalar data is interpreted correctly. Tools exist to coerce the data to have the appropriate scientfic type; see -[MLJScientificTypes.jl](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/) +[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/) or run `?coerce` for details. 
Additionally, most data containers - such as tuples, vectors, matrices @@ -296,7 +296,7 @@ and tables - have a scientific type. ![](img/scitypes.png) *Figure 1. Part of the scientific type hierarchy in* -[ScientificTypes.jl](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/). +[ScientificTypesBase.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/). ```@repl doda scitype(4.6) @@ -384,12 +384,12 @@ i.input_scitype i.target_scitype ``` -But see also [Model Search](@ref). +But see also [Model Search](@ref model_search). ### Scalar scientific types Models in MLJ will always apply the `MLJ` convention described in -[MLJScientificTypes.jl](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/) +[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/) to decide how to interpret the elements of your container types. Here are the key features of that convention: @@ -417,7 +417,7 @@ appropriate `Finite` (categorical) scitype. See [Working with Categorical Data] For more on scitype coercion of arrays and tables, see [`coerce`](@ref), [`autotype`](@ref) and [`unpack`](@ref) below and the examples at -[MLJScientificTypes.jl](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/). +[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/). diff --git a/docs/src/index.md b/docs/src/index.md index c3384c239..237be965e 100755 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -186,7 +186,7 @@ installation required. * Consistent interface to handle probabilistic predictions. * Extensible [tuning - interface](https://github.com/alan-turing-institute/MLJTuning.jl), + interface](https://github.com/JuliaAI/MLJTuning.jl), to support growing number of optimization strategies, and designed to play well with model composition. @@ -245,8 +245,7 @@ on the `#mlj` slack workspace in the Julia Slack channel. 
Bugs, suggestions, and feature requests can be posted [here](https://github.com/alan-turing-institute/MLJ.jl/issues). -For known issues that are not strictly MLJ bugs, see -[here](https://github.com/alan-turing-institute/MLJ.jl#known-issues) +See also [Known Issues](@ref). ## Installation @@ -286,8 +285,8 @@ julia> tree = Tree() # instance where you will also be asked to choose a providing package, for more than one provide a `DecisionTreeClassifier` model. For more on -identifying the name of an applicable model, see [Model -Search](@ref). For non-interactive loading of code (e.g., from a +identifying the name of an applicable model, see [Model Search](@ref model_search). +For non-interactive loading of code (e.g., from a module or function) see [Loading Model Code](@ref). It is recommended that you start with models from more mature @@ -296,7 +295,10 @@ packages such as DecisionTree.jl, ScikitLearn.jl or XGBoost.jl. MLJ is supported by a number of satellite packages (MLJTuning, MLJModelInterface, etc) which the general user is *not* required to install directly. Developers can learn more about these -[here](https://github.com/alan-turing-institute/MLJ.jl/blob/master/ORGANIZATION.md) +[here](https://github.com/alan-turing-institute/MLJ.jl/blob/master/ORGANIZATION.md). + +See also the alternative installation instructions for [Customizing +Behavior](@ref). ## Learning Julia @@ -338,8 +340,8 @@ questions and make suggestions.
## Citing MLJ -When presenting work that uses MLJ, please cite the MLJ design -paper: +An overview of MLJ design: + [![DOI](https://joss.theoj.org/papers/10.21105/joss.02704/status.svg)](https://doi.org/10.21105/joss.02704) @@ -358,16 +360,17 @@ paper: } ``` -If using the model composition features of MLJ (learning networks) -please additionally cite +An in-depth view of MLJ's model composition design: -```bitex +[![arXiv](https://img.shields.io/badge/arXiv-2012.15505-.svg)](https://arxiv.org/abs/2012.15505) + +```bibtex @misc{blaom2020flexible, - title={{Flexible model composition in machine learning and its implementation in MLJ}}, - author={Anthony D. Blaom and Sebastian J. Vollmer}, - year={2020}, - eprint={2012.15505}, - archivePrefix={arXiv}, - primaryClass={cs.LG} + title={Flexible model composition in machine learning and its implementation in {MLJ}}, + author={Anthony D. Blaom and Sebastian J. Vollmer}, + year={2020}, + eprint={2012.15505}, + archivePrefix={arXiv}, + primaryClass={cs.LG} } ``` diff --git a/docs/src/julia_blogpost.md b/docs/src/julia_blogpost.md index 24efa68cd..7e1c86739 100644 --- a/docs/src/julia_blogpost.md +++ b/docs/src/julia_blogpost.md @@ -40,7 +40,7 @@ composition. - Video from [London Julia User Group meetup in March 2019](https://www.youtube.com/watch?v=CfHkjNmj1eE) (skip to [demo at 21'39](https://youtu.be/CfHkjNmj1eE?t=21m39s))   -- [MLJ Tutorials](https://alan-turing-institute.github.io/MLJTutorials/) +- [MLJ Tutorials](https://JuliaAI.github.io/MLJTutorials/) - Implementing the MLJ interface for a [new model](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/) @@ -224,4 +224,4 @@ read. In this respect we have been inspired by [On Machine Learning and Programming Languages](https://julialang.org/blog/2017/12/ml&pl). 
## Invitation to the community -We now invite the community to try out our newly registered packages, [MLJ](https://github.com/alan-turing-institute/MLJ.jl)alongside [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl), and provide any feedback or suggestions you may have going forward. We are also particularly interested in hearing how you would use our package, and what features it may be lacking. +We now invite the community to try out our newly registered packages, [MLJ](https://github.com/alan-turing-institute/MLJ.jl) alongside [MLJModels](https://github.com/JuliaAI/MLJModels.jl), and provide any feedback or suggestions you may have going forward. We are also particularly interested in hearing how you would use our package, and what features it may be lacking.
Workaround: +Instead of `XGBoost` models (the chief known case) use models from the +pure Julia package `EvoTrees`. + diff --git a/docs/src/list_of_supported_models.md b/docs/src/list_of_supported_models.md index f545f8aef..7f91b4d09 100644 --- a/docs/src/list_of_supported_models.md +++ b/docs/src/list_of_supported_models.md @@ -24,12 +24,12 @@ models()`. [GLM.jl](https://github.com/JuliaStats/GLM.jl) | LinearRegressor, LinearBinaryClassifier, LinearCountRegressor | medium | † [LIBSVM.jl](https://github.com/mpastell/LIBSVM.jl) | LinearSVC, SVC, NuSVC, NuSVR, EpsilonSVR, OneClassSVM | high | also via ScikitLearn.jl [LightGBM.jl](https://github.com/IQVIA-ML/LightGBM.jl) | LightGBMClassifier, LightGBMRegressor | high | -[MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl) | NeuralNetworkRegressor, NeuralNetworkClassifier, MultitargetNeuralNetworkRegressor, ImageClassifier | experimental | -[MLJLinearModels.jl](https://github.com/alan-turing-institute/MLJLinearModels.jl) | LinearRegressor, RidgeRegressor, LassoRegressor, ElasticNetRegressor, QuantileRegressor, HuberRegressor, RobustRegressor, LADRegressor, LogisticClassifier, MultinomialClassifier | experimental | -[MLJModels.jl](https://github.com/alan-turing-institute/MLJModels.jl) (built-in) | StaticTransformer, FeatureSelector, FillImputer, UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer, OneHotEncoder, ContinuousEncoder, ConstantRegressor, ConstantClassifier, BinaryThreshholdPredictor | medium | +[MLJFlux.jl](https://github.com/JuliaAI/MLJFlux.jl) | NeuralNetworkRegressor, NeuralNetworkClassifier, MultitargetNeuralNetworkRegressor, ImageClassifier | experimental | +[MLJLinearModels.jl](https://github.com/JuliaAI/MLJLinearModels.jl) | LinearRegressor, RidgeRegressor, LassoRegressor, ElasticNetRegressor, QuantileRegressor, HuberRegressor, RobustRegressor, LADRegressor, LogisticClassifier, MultinomialClassifier | experimental | +[MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) 
(built-in) | StaticTransformer, FeatureSelector, FillImputer, UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer, OneHotEncoder, ContinuousEncoder, ConstantRegressor, ConstantClassifier, BinaryThreshholdPredictor | medium | [MultivariateStats.jl](https://github.com/JuliaStats/MultivariateStats.jl) | LinearRegressor, MultitargetLinearRegressor, RidgeRegressor, MultitargetRidgeRegressor, PCA, KernelPCA, ICA, LDA, BayesianLDA, SubspaceLDA, BayesianSubspaceLDA, FactorAnalysis, PPCA | high | [NaiveBayes.jl](https://github.com/dfdx/NaiveBayes.jl) | GaussianNBClassifier, MultinomialNBClassifier, HybridNBClassifier | experimental | -[NearestNeighborModels.jl](https://github.com/alan-turing-institute/NearestNeighborModels.jl) | KNNClassifier, KNNRegressor, MultitargetKNNClassifier, MultitargetKNNRegressor | high | +[NearestNeighborModels.jl](https://github.com/JuliaAI/NearestNeighborModels.jl) | KNNClassifier, KNNRegressor, MultitargetKNNClassifier, MultitargetKNNRegressor | high | [ParallelKMeans.jl](https://github.com/PyDataBlog/ParallelKMeans.jl) | KMeans | experimental | [PartialLeastSquaresRegressor.jl](https://github.com/lalvim/PartialLeastSquaresRegressor.jl) | PLSRegressor, KPLSRegressor | experimental | [ScikitLearn.jl](https://github.com/cstjean/ScikitLearn.jl) | ARDRegressor, AdaBoostClassifier, AdaBoostRegressor, AffinityPropagation, AgglomerativeClustering, BaggingClassifier, BaggingRegressor, BayesianLDA, BayesianQDA, BayesianRidgeRegressor, BernoulliNBClassifier, Birch, ComplementNBClassifier, DBSCAN, DummyClassifier, DummyRegressor, ElasticNetCVRegressor, ElasticNetRegressor, ExtraTreesClassifier, ExtraTreesRegressor, FeatureAgglomeration, GaussianNBClassifier, GaussianProcessClassifier, GaussianProcessRegressor, GradientBoostingClassifier, GradientBoostingRegressor, HuberRegressor, KMeans, KNeighborsClassifier, KNeighborsRegressor, LarsCVRegressor, LarsRegressor, LassoCVRegressor, LassoLarsCVRegressor, LassoLarsICRegressor, 
LassoLarsRegressor, LassoRegressor, LinearRegressor, LogisticCVClassifier, LogisticClassifier, MeanShift, MiniBatchKMeans, MultiTaskElasticNetCVRegressor, MultiTaskElasticNetRegressor, MultiTaskLassoCVRegressor, MultiTaskLassoRegressor, MultinomialNBClassifier, OPTICS, OrthogonalMatchingPursuitCVRegressor, OrthogonalMatchingPursuitRegressor, PassiveAggressiveClassifier, PassiveAggressiveRegressor, PerceptronClassifier, ProbabilisticSGDClassifier, RANSACRegressor, RandomForestClassifier, RandomForestRegressor, RidgeCVClassifier, RidgeCVRegressor, RidgeClassifier, RidgeRegressor, SGDClassifier, SGDRegressor, SVMClassifier, SVMLClassifier, SVMLRegressor, SVMNuClassifier, SVMNuRegressor, SVMRegressor, SpectralClustering, TheilSenRegressor | high | † diff --git a/docs/src/loading_model_code.md b/docs/src/loading_model_code.md index 16ebe4ae6..f49f5e377 100644 --- a/docs/src/loading_model_code.md +++ b/docs/src/loading_model_code.md @@ -1,12 +1,12 @@ # Loading Model Code Once the name of a model, and the package providing that model, have -been identified (see [Model Search](@ref)) one can either import the -model type interactively with `@iload`, as shown under -[Installation](@ref), or use `@load` as shown below. The `@load` -macro works from within a module, a package or a function, provided -the relevant package providing the MLJ interface has been added to -your package environment. +been identified (see [Model Search](@ref model_search)) one can either +import the model type interactively with `@iload`, as shown under +[Installation](@ref), or use `@load` as shown below. The `@load` macro +works from within a module, a package or a function, provided the +relevant package providing the MLJ interface has been added to your +package environment. 
In general, the code providing core functionality for the model (living in a packge you should consult for documentation) may be diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index 77576d95f..6458f4c08 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -10,7 +10,7 @@ To see list all measures, run `measures()`. Further measures for probabilistic predictors, such as proper scoring rules, and for constructing multi-target product measures, are planned. If you'd like to see measure added to MLJ, post a comment -[here](https://github.com/alan-turing-institute/MLJBase.jl/issues/299) +[here](https://github.com/JuliaAI/MLJBase.jl/issues/299). *Note for developers:* The measures interface and the built-in measures described here are defined in MLJBase, but will ultimately live @@ -51,7 +51,7 @@ log_loss(ŷ, y) ``` The measures `rms`, `l2` and `log_loss` illustrated here are actually - instances of measure *types*. For, example, `l2 = LPLoss(p=2)` and + instances of measure *types*. For example, `l2 = LPLoss(p=2)` and `log_loss = LogLoss() = LogLoss(tol=eps())`. Common aliases are provided: @@ -152,16 +152,21 @@ predictions. ## List of measures +All measures listed below have a doc-string associated with the measure's +*type*. So, for example, do `?LPLoss` not `?l2`.
+ ```@setup losses_and_scores using DataFrames ``` ```@example losses_and_scores ms = measures() -types = map(ms) do m m.name end +types = map(ms) do m + m.name +end instance = map(ms) do m m.instances end -t = (type=types, instances=instance) -DataFrame(t) +table = (type=types, instances=instance) +DataFrame(table) ``` diff --git a/docs/src/preparing_data.md b/docs/src/preparing_data.md index 6ecebb31b..104f7bcdf 100644 --- a/docs/src/preparing_data.md +++ b/docs/src/preparing_data.md @@ -1,7 +1,7 @@ # Preparing Data As outlined in [Getting Started](@ref), it is important that the -[scientific type](https://github.com/JuliaAI/ScientificTypes.jl) of +[scientific type](https://github.com/JuliaAI/ScientificTypesBase.jl) of data matches the requirements of the model of interest. For example, while the majority of supervised learning models require input features to be `Continuous`, newcomers to MLJ are sometimes @@ -93,7 +93,7 @@ Pipelines](@ref). ## Scientific type coercion Scientific type coercion is documented in detail at -[ScientificTypes.jl](https://github.com/JuliaAI/ScientificTypes.jl). See +[ScientificTypesBase.jl](https://github.com/JuliaAI/ScientificTypesBase.jl). See also the tutorial at the [this MLJ Workshop](https://github.com/ablaom/MachineLearningInJulia2020) (specifically, @@ -114,7 +114,7 @@ MLJ's Built-in transformers are documented at [Transformers and other with ```julia -MissingImputator = @load MissingImputator pkg=BetaML`. +MissingImputator = @load MissingImputator pkg=BetaML ``` [This MLJ diff --git a/docs/src/quick_start_guide_to_adding_models.md b/docs/src/quick_start_guide_to_adding_models.md index 8e3e4fe91..3282e0ee2 100644 --- a/docs/src/quick_start_guide_to_adding_models.md +++ b/docs/src/quick_start_guide_to_adding_models.md @@ -8,7 +8,7 @@ learning models; (ii) that you would like to interface and register these models with MLJ; and (iii) that you have a rough understanding of how things work with MLJ. 
In particular you are familiar with: -- what [scientific types](https://github.com/alan-turing-institute/MLJScientificTypes.jl) are +- what [scientific types](https://github.com/JuliaAI/ScientificTypes.jl) are - what `Probabilistic`, `Deterministic` and `Unsupervised` models are @@ -53,14 +53,14 @@ includes: #### Important -[MLJModelInterface](https://github.com/alan-turing-institute/MLJModelInterface.jl) +[MLJModelInterface](https://github.com/JuliaAI/MLJModelInterface.jl) is a very light-weight interface allowing you to *define* your interface, but does not provide the functionality required to use or test your interface; this requires -[MLJBase](https://github.com/alan-turing-institute/MLJBase.jl). So, +[MLJBase](https://github.com/JuliaAI/MLJBase.jl). So, while you only need to add `MLJModelInterface` to your project's [deps], for testing purposes you need to add -[MLJBase](https://github.com/alan-turing-institute/MLJBase.jl) to your +[MLJBase](https://github.com/JuliaAI/MLJBase.jl) to your project's [extras] and [targets]. In testing, simply use `MLJBase` in place of `MLJModelInterface`. @@ -100,7 +100,7 @@ the field `a` is a `Float64`, takes `0.5` as default value, and expects its value to be positive. Please see [this -issue](https://github.com/alan-turing-institute/MLJBase.jl/issues/68) +issue](https://github.com/JuliaAI/MLJBase.jl/issues/68) for a known issue and workaround relating to the use of `@mlj_model` with negative defaults. @@ -148,8 +148,8 @@ end **Examples**: -- [KNNClassifier](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/NearestNeighbors.jl#L62-L69) which uses `@mlj_model`, -- [XGBoostRegressor](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/XGBoost.jl#L17-L161) which does not. 
+- [KNNClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/NearestNeighbors.jl#L62-L69) which uses `@mlj_model`, +- [XGBoostRegressor](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/XGBoost.jl#L17-L161) which does not. ### Fit @@ -191,7 +191,7 @@ The `report` should be a `NamedTuple` with any auxiliary useful information that a user would want to know about the fit (e.g., feature rankings). See more on this below. -**Example**: GLM's [LinearRegressor](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L95-L105) +**Example**: GLM's [LinearRegressor](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L95-L105) #### Classifier @@ -222,16 +222,16 @@ detailed instructions at [The predict method](@ref). **Examples**: -- GLM's [BinaryClassifier](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L119-L131) (`Probabilistic`) +- GLM's [BinaryClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L119-L131) (`Probabilistic`) -- LIBSVM's [SVC](https://github.com/alan-turing-institute/MLJModels.jl/blob/master/src/LIBSVM.jl) (`Deterministic`) +- LIBSVM's [SVC](https://github.com/JuliaAI/MLJModels.jl/blob/master/src/LIBSVM.jl) (`Deterministic`) #### Transformer Nothing special for a transformer. 
-**Example**: [FillImputer](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/builtins/Transformers.jl#L54-L64) +**Example**: [FillImputer](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/builtins/Transformers.jl#L54-L64) ### Fitted parameters @@ -254,7 +254,7 @@ function MLJModelInterface.fitted_params(model::YourModel, fitresult) end ``` -**Example**: for [GLM models](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L133-L137) +**Example**: for [GLM models](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L133-L137) ### Summary of user interface points (or, What to put where?) @@ -315,9 +315,9 @@ whose performance may suffice. **Examples** -- Deterministic regression: [KNNRegressor](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/NearestNeighbors.jl#L124-L145) -- Probabilistic regression: [LinearRegressor](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L154-L158) and the [`predict_mean`](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L144-L147) -- Probabilistic classification: [LogisticClassifier](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L165-L168) +- Deterministic regression: [KNNRegressor](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/NearestNeighbors.jl#L124-L145) +- Probabilistic regression: [LinearRegressor](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L154-L158) and the [`predict_mean`](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L144-L147) +- Probabilistic 
classification: [LogisticClassifier](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L165-L168) ### Metadata @@ -359,16 +359,16 @@ correct `load_path` MLJ will be unable to import your model. **Examples**: - package metadata - - [GLM](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L179-L186) - - [MLJLinearModels](https://github.com/alan-turing-institute/MLJLinearModels.jl/blob/289a373a8357c4afc191711d0218aa1523e97f70/src/mlj/interface.jl#L91-L97) + - [GLM](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L179-L186) + - [MLJLinearModels](https://github.com/JuliaAI/MLJLinearModels.jl/blob/289a373a8357c4afc191711d0218aa1523e97f70/src/mlj/interface.jl#L91-L97) - model metadata - - [LinearRegressor](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L188-L193) - - [DecisionTree](https://github.com/alan-turing-institute/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/DecisionTree.jl#L225-L229) - - [A series of regressors](https://github.com/alan-turing-institute/MLJLinearModels.jl/blob/289a373a8357c4afc191711d0218aa1523e97f70/src/mlj/interface.jl#L105-L111) + - [LinearRegressor](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/GLM.jl#L188-L193) + - [DecisionTree](https://github.com/JuliaAI/MLJModels.jl/blob/3687491b132be8493b6f7a322aedf66008caaab1/src/DecisionTree.jl#L225-L229) + - [A series of regressors](https://github.com/JuliaAI/MLJLinearModels.jl/blob/289a373a8357c4afc191711d0218aa1523e97f70/src/mlj/interface.jl#L105-L111) ### Adding a model to the model registry -See [here](https://github.com/alan-turing-institute/MLJModels.jl/tree/master#instructions-for-updating-the-mlj-model-registry). 
+See [here](https://github.com/JuliaAI/MLJModels.jl/tree/master#instructions-for-updating-the-mlj-model-registry). diff --git a/docs/src/third_party_packages.md b/docs/src/third_party_packages.md index e6692a661..5469da674 100644 --- a/docs/src/third_party_packages.md +++ b/docs/src/third_party_packages.md @@ -13,15 +13,15 @@ post an issue requesting this - [ScikitLearn.jl](https://github.com/cstjean/ScikitLearn.jl) - [DecisionTree.jl](https://github.com/bensadeghi/DecisionTree.jl) - [MultivariateStats.jl](https://github.com/JuliaStats/MultivariateStats.jl) -- [MLJModels.jl](https://github.com/alan-turing-institute/MLJModels.jl) -- [MLJLinearModels.jl](https://github.com/alan-turing-institute/MLJLinearModels.jl) +- [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) +- [MLJLinearModels.jl](https://github.com/JuliaAI/MLJLinearModels.jl) - [LIBSVM.jl](https://github.com/mpastell/LIBSVM.jl) - [EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) - [NaiveBayes.jl](https://github.com/dfdx/NaiveBayes.jl) -- [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl) (depending on [Flux.jl](https://github.com/FluxML/Flux.jl)) +- [MLJFlux.jl](https://github.com/JuliaAI/MLJFlux.jl) (depending on [Flux.jl](https://github.com/FluxML/Flux.jl)) - [Clustering.jl](https://github.com/JuliaStats/Clustering.jl) - [ParallelKMeans.jl](https://github.com/PyDataBlog/ParallelKMeans.jl) -- [NearestNeighborModels.jl](https://github.com/alan-turing-institute/NearestNeighborModels.jl) (depending on [NearestNeighbors.jl](https://github.com/KristofferC/NearestNeighbors.jl)) +- [NearestNeighborModels.jl](https://github.com/JuliaAI/NearestNeighborModels.jl) (depending on [NearestNeighbors.jl](https://github.com/KristofferC/NearestNeighbors.jl)) - [PartialLeastSquaresRegressor.jl](https://github.com/lalvim/PartialLeastSquaresRegressor.jl) - [LightGBM.jl](https://github.com/IQVIA-ML/LightGBM.jl) - [GLM.jl](https://github.com/JuliaStats/GLM.jl) diff --git a/docs/src/tuning_models.md 
b/docs/src/tuning_models.md index ea7f5c937..3b6411a28 100644 --- a/docs/src/tuning_models.md +++ b/docs/src/tuning_models.md @@ -5,7 +5,7 @@ Below we illustrate hyperparameter optimisation using the tuning strategies. Also available is the [tree Parzen](https://github.com/IQVIA-ML/TreeParzen.jl) strategy; for a complete list, see - [here](https://github.com/alan-turing-institute/MLJTuning.jl#what-is-provided-here). + [here](https://github.com/JuliaAI/MLJTuning.jl#what-is-provided-here). MLJ tuning is implemented as an *iterative* procedure, which can accordingly be controlled using MLJ's [`IteratedModel`](@ref @@ -24,7 +24,7 @@ model may be viewed as a "self-tuning" version of the unwrapped model. For in-depth overview of tuning in MLJ, or for implementation details, see the [MLJTuning -documentation](https://github.com/alan-turing-institute/MLJTuning.jl). For +documentation](https://github.com/JuliaAI/MLJTuning.jl). For a complete list of options see the [`TunedModel`](@ref) doc-string below. diff --git a/docs/src/working_with_categorical_data.md b/docs/src/working_with_categorical_data.md index cb5cdb6a4..feebff865 100644 --- a/docs/src/working_with_categorical_data.md +++ b/docs/src/working_with_categorical_data.md @@ -3,8 +3,8 @@ ## Scientific types for discrete data Recall that models articulate their data requirements using scientific -types (see [Getting Started](@ref) or the [MLJScientificTypes.jl -documentation](https://alan-turing-institute.github.io/MLJScientificTypes.jl/dev/)). There +types (see [Getting Started](@ref) or the [ScientificTypes.jl +documentation](https://JuliaAI.github.io/ScientificTypes.jl/dev/)). There are three scientific types discrete data can have: `Count`, `OrderedFactor` and `Multiclass`. @@ -62,7 +62,7 @@ above. To inspect all column scientific types in a table simultaneously, use `schema`. 
(The `scitype(X)` of a table `X` contains a condensed form of this information used in type dispatch; see -[here](https://github.com/alan-turing-institute/ScientificTypes.jl#more-on-the-table-type).) +[here](https://github.com/JuliaAI/ScientificTypesBase.jl#more-on-the-table-type).) ```@example hut import DataFrames.DataFrame diff --git a/material/IQVIA_logo.png b/material/IQVIA_logo.png new file mode 100644 index 000000000..aaaf0473e Binary files /dev/null and b/material/IQVIA_logo.png differ diff --git a/material/MLJ_stack_transparent.svg b/material/MLJ_stack_transparent.svg new file mode 100644 index 000000000..705cea13c --- /dev/null +++ b/material/MLJ_stack_transparent.svg @@ -0,0 +1,3 @@ + + +
MLJModelInterface
MLJModelInterface
ScientificTypesBase
ScientificTypesBase
ScientificTypes
ScientificTypes
MLJBase
MLJBase
MLJTuning
MLJTuning
MLJModels
MLJModels
MLJLinearModels
MLJLinearModels
ThirdPartyModelPkg
ThirdPartyModelPkg
MLJ
MLJ
(StatisticalMeasures)
(StatisticalMeasures)
MLJEnsembles
MLJEnsembles
DataScienceTutorials
DataScienceTutorials
MLJFlux
MLJFlux
satellite packages
satellite packages
A            B  means "A depends on B"
A            B  means "A depends on...
 For general MLJ Users
 For general MLJ Users
For model  API implementation/testing

For model  API implementation/testin...
 Of interest outside MLJ
 Of interest outside MLJ
For MLJ developers
For MLJ developers
StatisticalTraits
StatisticalTraits
MLJIteration
MLJIteration
MLJOpenML
MLJOpenML
MLJSerialization
MLJSerializat...
MLJClusteringInterface
MLJClusteringInterf...
MLJDecisionTreeInterface
MLJDecisionTreeInte...
MLJGLMInterface
MLJGLMInterface
MLJLIBSVMInterface
MLJLIBSVMInterface
MLJMultivariateStatsInterface
MLJMultivariateStat...
MLJNaiveBayesInterface 
MLJNaiveBayesInterf...
MLJScikitLearnInterface
MLJScikitLearnInter...
MLJXGBoostInterface
MLJXGBoostInterface
Interfaces for third party
packages administered by MLJ:
Interfaces for third party...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/material/NESI-logo.png b/material/NESI-logo.png new file mode 100644 index 000000000..12a1b8f6d Binary files /dev/null and b/material/NESI-logo.png differ diff --git a/material/Turing_logo.png b/material/Turing_logo.png new file mode 100644 index 000000000..e6ec5e06b Binary files /dev/null and b/material/Turing_logo.png differ diff --git a/material/UoA_logo.png b/material/UoA_logo.png new file mode 100644 index 000000000..f387097f8 Binary files /dev/null and b/material/UoA_logo.png differ diff --git a/material/julia.png b/material/julia.png new file mode 100644 index 000000000..80a9bc638 Binary files /dev/null and b/material/julia.png differ diff --git a/material/warwick.png b/material/warwick.png new file mode 100644 index 000000000..cbf65295c Binary files /dev/null and b/material/warwick.png differ diff --git a/paper/paper.md b/paper/paper.md index 6e05a37c5..468c4bde1 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -130,7 +130,7 @@ representation of probabilities, are avoided. **Scientific types** To help users focus less on data representation (e.g., `Float32`, `DataFrame`) and more on the intended *purpose* or *interpretation* of data, MLJ articulates model data requirements -using *scientific types* [@ScientificTypes], such as "continuous", +using *scientific types* [@ScientificTypesBase], such as "continuous", "ordered factor" or "table". **Connecting models directly to arbitrary data containers**. 
A diff --git a/src/MLJ.jl b/src/MLJ.jl index 2ae80f8cf..c8b1522f5 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -33,7 +33,7 @@ using ComputationalResources: CPUProcesses import MLJBase: fit, update, clean!, fit!, predict, fitted_params, show_as_constructed, == import MLJModels: models -import MLJScientificTypes +import ScientificTypes ## METHOD EXPORT @@ -47,7 +47,7 @@ export MLJ_VERSION export pdf, logpdf, mode, median, mean, shuffle!, categorical, shuffle, levels, levels!, std, support, sampler -# re-exports from (MLJ)ScientificTypes via MLJBase +# re-exports from (MLJ)ScientificTypesBase via MLJBase export Scientific, Found, Unknown, Known, Finite, Infinite, OrderedFactor, Multiclass, Count, Continuous, Textual, Binary, ColorImage, GrayImage, Image, Table @@ -222,6 +222,6 @@ const srcdir = dirname(@__FILE__) ## INCLUDE FILES include("version.jl") # defines MLJ_VERSION constant -include("scitypes.jl") # extensions to ScientificTypes.scitype +include("scitypes.jl") # extensions to ScientificTypesBase.scitype end # module diff --git a/src/scitypes.jl b/src/scitypes.jl index 314ce510d..537375d48 100644 --- a/src/scitypes.jl +++ b/src/scitypes.jl @@ -2,21 +2,21 @@ # This implementation of scitype for models and measures is highly experimental -const MST = MLJScientificTypes # only used in this file +const ST = ScientificTypes # only used in this file struct SupervisedScitype{input_scitype, target_scitype, prediction_type} end -MST.scitype(model::Deterministic, ::MST.MLJ) = +ST.scitype(model::Deterministic, ::ST.DefaultConvention) = SupervisedScitype{input_scitype(model), target_scitype(model), :deterministic} -MST.scitype(model::Probabilistic, ::MST.MLJ) = +ST.scitype(model::Probabilistic, ::ST.DefaultConvention) = SupervisedScitype{input_scitype(model), target_scitype(model), :probabilistic} -MST.scitype(model::Interval, ::MST.MLJ) = +ST.scitype(model::Interval, ::ST.DefaultConvention) = SupervisedScitype{input_scitype(model), target_scitype(model), :interval} @@ 
-52,7 +52,7 @@ end struct UnsupervisedScitype{input_scitype, output_scitype} end -MST.scitype(model::Unsupervised, ::MST.MLJ) = +ST.scitype(model::Unsupervised, ::ST.DefaultConvention) = UnsupervisedScitype{input_scitype(model), MLJBase.output_scitype(model)} @@ -91,7 +91,7 @@ struct MeasureScitype{target_scitype, is_feature_dependent, supports_weights} end -MST.scitype(measure, ::MST.MLJ, ::Val{:measure}) = +ST.scitype(measure, ::ST.DefaultConvention, ::Val{:measure}) = MeasureScitype{target_scitype(measure), prediction_type(measure), orientation(measure),