From f4991e52765f8e58ca05087db86c0743257590c8 Mon Sep 17 00:00:00 2001
From: lfenzo <lfenzo@protonmail.com>
Date: Mon, 30 May 2022 17:33:48 -0300
Subject: [PATCH 1/6] Reformatted make.jl in docs dir

---
 docs/make.jl | 81 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 129129781c..52fb0a75d6 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,39 +1,50 @@
 using Documenter, Flux, NNlib, Functors, MLUtils, BSON
 
+
 DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true)
-makedocs(modules = [Flux, NNlib, Functors, MLUtils, BSON],
-         doctest = false,
-         sitename = "Flux",
-         pages = ["Home" => "index.md",
-                  "Building Models" =>
-                    ["Overview" => "models/overview.md",
-                     "Basics" => "models/basics.md",
-                     "Recurrence" => "models/recurrence.md",
-                     "Model Reference" => "models/layers.md",
-                     "Loss Functions" => "models/losses.md",
-                     "Regularisation" => "models/regularisation.md",
-                     "Advanced Model Building" => "models/advanced.md",
-                     "NNlib" => "models/nnlib.md",
-                     "Functors" => "models/functors.md"],
-                  "Handling Data" =>
-                    ["One-Hot Encoding" => "data/onehot.md",
-                     "MLUtils" => "data/mlutils.md"],
-                  "Training Models" =>
-                    ["Optimisers" => "training/optimisers.md",
-                     "Training" => "training/training.md"],
-                  "GPU Support" => "gpu.md",
-                  "Saving & Loading" => "saving.md",
-                  "The Julia Ecosystem" => "ecosystem.md",
-                  "Utility Functions" => "utilities.md",
-                  "Performance Tips" => "performance.md",
-                  "Datasets" => "datasets.md",
-                  "Community" => "community.md"],
-         format = Documenter.HTML(
-             analytics = "UA-36890222-9",
-             assets = ["assets/flux.css"],
-             prettyurls = get(ENV, "CI", nothing) == "true"),
-         )
 
-deploydocs(repo = "github.com/FluxML/Flux.jl.git",
-           target = "build",
-           push_preview = true)
+makedocs(
+    modules = [Flux, NNlib, Functors, MLUtils, BSON],
+    doctest = false,
+    sitename = "Flux",
+    pages = [
+        "Home" => "index.md",
+        "Building Models" => [
+            "Overview" => "models/overview.md",
+            "Basics" => "models/basics.md",
+            "Recurrence" => "models/recurrence.md",
+            "Model Reference" => "models/layers.md",
+            "Loss Functions" => "models/losses.md",
+            "Regularisation" => "models/regularisation.md",
+            "Advanced Model Building" => "models/advanced.md",
+            "NNlib" => "models/nnlib.md",
+            "Functors" => "models/functors.md"
+        ],
+        "Handling Data" => [
+            "One-Hot Encoding" => "data/onehot.md",
+            "MLUtils" => "data/mlutils.md"
+        ],
+        "Training Models" => [
+            "Optimisers" => "training/optimisers.md",
+            "Training" => "training/training.md"
+        ],
+        "GPU Support" => "gpu.md",
+        "Saving & Loading" => "saving.md",
+        "The Julia Ecosystem" => "ecosystem.md",
+        "Utility Functions" => "utilities.md",
+        "Performance Tips" => "performance.md",
+        "Datasets" => "datasets.md",
+        "Community" => "community.md"
+    ],
+    format = Documenter.HTML(
+        analytics = "UA-36890222-9",
+        assets = ["assets/flux.css"],
+        prettyurls = get(ENV, "CI", nothing) == "true"  # only when ENV["CI"] == "true"
+    ),
+)
+
+deploydocs(
+    repo = "github.com/FluxML/Flux.jl.git",
+    target = "build",
+    push_preview = true
+)

From eb2927749b362c6d4708f3df85262b6bf1b9537f Mon Sep 17 00:00:00 2001
From: lfenzo <lfenzo@protonmail.com>
Date: Mon, 30 May 2022 20:42:09 -0300
Subject: [PATCH 2/6] docs: added first draft of common gpu workflows in the
 GPU Support page

---
 docs/src/gpu.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index c920b16fbc..1cf29cf2c8 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -1,6 +1,6 @@
 # GPU Support
 
-NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CUDA](https://github.com/JuliaGPU/CUDA.jl) readme.
+NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) readme.
 
 ## Checking GPU Availability
 
@@ -86,6 +86,64 @@ julia> x |> cpu
  0.7766742
 ```
 
+## Common GPU Workflows
+
+Some of the common workflows involving the use of GPUs are presented below.
+
+### Transferring Training Data
+
+In order to train the model using the GPU both model and the training data have be transferred to GPU memory. This process can be done with the `gpu` function in two different  ways:
+
+1. Iterating over the batches in a [DataLoader](@ref) object transfering each one of the training batches at a time to the GPU. 
+   ```julia
+   train_loader = Flux.DataLoader((xtrain, ytrain), batchsize = 64, shuffle = true)
+   # ... model, optimizer and loss definitions
+   for epoch in 1:nepochs
+       for (xtrain_batch, ytrain_batch) in train_loader
+           x, y = gpu(xtrain_batch), gpu(ytrain_batch)
+           gradients = gradient(() -> loss(x, y), parameters)
+           Flux.Optimise.update!(optimizer, parameters, gradients)
+       end
+   end
+   ```
+
+1. Transferring all training data to the GPU at once before creating the [DataLoader](@ref) object. This is usually performed for smaller datasets which are sure to fit in the available GPU memory. Some possitilities are:
+   ```julia
+   gpu_x = gpu(xtrain)
+   gpu_y = gpu(ytrain)
+
+   gpu_train_loader = Flux.DataLoader((gpu_x, gpu_y), batchsize = 32)
+   ```
+   ```julia
+   gpu_train_loader = Flux.DataLoader((xtrain |> gpu, ytrain |> gpu), batchsize = 32)
+   ```
+   ```julia
+   gpu_train_loader = Flux.DataLoader(gpu.(collect.((xtrain, ytrain))), batchsize = 32)
+   ```
+
+### Saving GPU-Trained Models
+
+After the training process is done one must always transfer the trained model back to the `cpu` memory scope before saving it to secundary memory. This can be done, as described in the previous section, with:
+```julia
+model = cpu(model) # or model = model |> cpu
+```
+and then
+```julia
+using BSON
+# ...
+BSON.@save "./path/to/trained_model.bson" model
+
+# in this approach the cpu-transferred model (referenced by the variable `model`)
+# only exists inside the `let` statement
+let model = cpu(model)
+   BSON.@save "./path/to/trained_model.bson" model
+end
+```
+The reason behind this is that models trained in the GPU but not transferred to the CPU memory scope will expect `CuArray`s as input. In other words, Flux models expect input data coming from the same kind device in which they were trained on.
+
+In controlled scenarios in which the data fed to the loaded models is guaranteed to be on the GPU, there is no need to transfer them back to the CPU memory scope. However, in production environments, where artifacts are shared among different processes, machines or configurations, there is no guarantee that the CUDA.jl package will be available for the process performing inference on the model loaded from disk.
+
+
 ## Disabling CUDA or choosing which GPUs are visible to Flux
 
 Sometimes it is required to control which GPUs are visible to `julia` on a system with multiple GPUs or disable GPUs entirely. This can be achieved with an environment variable `CUDA_VISIBLE_DEVICES`.

From 203c21c8b8e648748d3d19643e081a1f2ce543d8 Mon Sep 17 00:00:00 2001
From: lfenzo <lfenzo@protonmail.com>
Date: Wed, 1 Jun 2022 19:38:46 -0300
Subject: [PATCH 3/6] docs: updates on common gpu workflows after first review

---
 docs/src/gpu.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 1cf29cf2c8..1ca4ac70dd 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -118,8 +118,9 @@ In order to train the model using the GPU both model and the training data have
    gpu_train_loader = Flux.DataLoader((xtrain |> gpu, ytrain |> gpu), batchsize = 32)
    ```
    ```julia
-   gpu_train_loader = Flux.DataLoader(gpu.(collect.((xtrain, ytrain))), batchsize = 32)
+   gpu_train_loader = Flux.DataLoader((xtrain, ytrain) |> gpu, batchsize = 32)
    ```
+   Note that both `gpu` and `cpu` are smart enough to recurse through tuples and namedtuples.
 
 ### Saving GPU-Trained Models
 
@@ -136,8 +137,12 @@ BSON.@save "./path/to/trained_model.bson" model
 # in this approach the cpu-transferred model (referenced by the variable `model`)
 # only exists inside the `let` statement
 let model = cpu(model)
+   # ...
    BSON.@save "./path/to/trained_model.bson" model
 end
+
+# is equivalent to the above, but uses the `key=value` storing directive from BSON.jl
+BSON.@save "./path/to/trained_model.bson" model = cpu(model)
 ```
 The reason behind this is that models trained in the GPU but not transferred to the CPU memory scope will expect `CuArray`s as input. In other words, Flux models expect input data coming from the same kind device in which they were trained on.
 

From f4c690d432e0e8fe95edc23cf1a48a612aac8651 Mon Sep 17 00:00:00 2001
From: Enzo L F <49812445+lfenzo@users.noreply.github.com>
Date: Thu, 2 Jun 2022 14:38:42 -0300
Subject: [PATCH 4/6] docs: small correction in docs/src/gpu.md

Co-authored-by: Carlo Lucibello <carlo.lucibello@gmail.com>
---
 docs/src/gpu.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 1ca4ac70dd..4709a77f15 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -92,7 +92,7 @@ Some of the common workflows involving the use of GPUs are presented below.
 
 ### Transferring Training Data
 
-In order to train the model using the GPU both model and the training data have be transferred to GPU memory. This process can be done with the `gpu` function in two different  ways:
+In order to train the model using the GPU, both the model and the training data have to be transferred to GPU memory. This process can be done with the `gpu` function in the following ways:
 
 1. Iterating over the batches in a [DataLoader](@ref) object transfering each one of the training batches at a time to the GPU. 
    ```julia

From 3ba5718a2b2147f9a50998c607cd551272306b70 Mon Sep 17 00:00:00 2001
From: lfenzo <lfenzo@protonmail.com>
Date: Sat, 4 Jun 2022 11:48:11 -0300
Subject: [PATCH 5/6] docs: second batch of corrections on common gpu workflows

---
 docs/src/gpu.md | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 4709a77f15..da40723abb 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -107,20 +107,33 @@ In order to train the model using the GPU both model and the training data have
    end
    ```
 
-1. Transferring all training data to the GPU at once before creating the [DataLoader](@ref) object. This is usually performed for smaller datasets which are sure to fit in the available GPU memory. Some possitilities are:
-   ```julia
-   gpu_x = gpu(xtrain)
-   gpu_y = gpu(ytrain)
-
-   gpu_train_loader = Flux.DataLoader((gpu_x, gpu_y), batchsize = 32)
-   ```
+2. Transferring all training data to the GPU at once before creating the [DataLoader](@ref) object. This is usually performed for smaller datasets which are sure to fit in the available GPU memory. Some possibilities are:
    ```julia
    gpu_train_loader = Flux.DataLoader((xtrain |> gpu, ytrain |> gpu), batchsize = 32)
    ```
    ```julia
    gpu_train_loader = Flux.DataLoader((xtrain, ytrain) |> gpu, batchsize = 32)
    ```
-   Note that both `gpu` and `cpu` are smart enough to recurse through tuples and namedtuples.
+   Note that both `gpu` and `cpu` are smart enough to recurse through tuples and namedtuples. Another possibility is to use [`MLUtils.mapobs`](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.mapobs) to push the data movement invocation into the background thread:
+   ```julia
+   using MLUtils: mapobs
+   # ...
+   gpu_train_loader = Flux.DataLoader(mapobs(gpu, (xtrain, ytrain)), batchsize = 16)
+   ```
+
+3. Wrapping the `DataLoader` in [`CUDA.CuIterator`](https://cuda.juliagpu.org/stable/usage/memory/#Batching-iterator) to efficiently move data to GPU on demand:
+   ```julia
+   using CUDA: CuIterator
+   train_loader = Flux.DataLoader((xtrain, ytrain), batchsize = 64, shuffle = true)
+   # ... model, optimizer and loss definitions
+   for epoch in 1:nepochs
+       for (xtrain_batch, ytrain_batch) in CuIterator(train_loader)
+          # ...
+       end
+   end
+   ```
+
+   Note that this works with a limited number of data types. If `iterate(train_loader)` returns anything other than arrays, approach 1 or 2 is preferred.
 
 ### Saving GPU-Trained Models
 

From f6907bea3bcfb51798863f7f04802a4b5abebe60 Mon Sep 17 00:00:00 2001
From: Enzo L F <49812445+lfenzo@users.noreply.github.com>
Date: Sat, 4 Jun 2022 14:42:01 -0300
Subject: [PATCH 6/6] docs: Update docs/src/gpu.md (common gpu workflows)

Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
---
 docs/src/gpu.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index da40723abb..cced9ca993 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -137,7 +137,7 @@ In order to train the model using the GPU both model and the training data have
 
 ### Saving GPU-Trained Models
 
-After the training process is done one must always transfer the trained model back to the `cpu` memory scope before saving it to secundary memory. This can be done, as described in the previous section, with:
+After the training process is done, one must always transfer the trained model back to the `cpu` memory scope before serializing or saving to disk. This can be done, as described in the previous section, with:
 ```julia
 model = cpu(model) # or model = model |> cpu
 ```