[ Layer ] Mixed Precision support for BN Layer
This PR adds mixed precision support to the batch normalization (BN)
layer. During training, the BN layer should run in full precision even when
the weight data is FP16. Therefore, reading the FP16 data and converting
the current weights and activations to FP32 is required.
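
A condensed sketch of that conversion pattern, matching the forwarding path
in nntrainer/layers/bn_layer.cpp below (the real code aliases em_input /
em_hidden through references; this is only an illustration):

    // Training path: promote FP16 activations to FP32 for the BN math,
    // then copy the FP32 result back into the FP16 output tensor.
    Tensor &in = context.getInput(SINGLE_INOUT_IDX);
    Tensor &out = context.getOutput(SINGLE_INOUT_IDX);

    // clone() converts FP16 data to FP32 (or just copies if already FP32).
    Tensor input_ = in.clone(TensorDim::DataType::FP32);
    Tensor hidden_ = out.clone(TensorDim::DataType::FP32);

    // ... full-precision batch normalization writes into hidden_ ...

    out.copyData(hidden_); // demote back to the activation data type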

For inference, we need compiler optimizations such as BN fusing, so
execution mode parameters are also added to compile.
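
The mode check added to finalize() below amounts to the following (restated
sketch; the comments are editorial):

    // Weight data type selection in BN finalize(), keyed on the execution
    // mode that compile/initialize now propagate.
    TensorDim dim(context.getFormat(), context.getWeightDataType());

    if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
      // Training: BN parameters and statistics are kept in full precision.
      dim.setDataType(TensorDim::DataType::FP32);
    }
    // Inference: the weights keep the configured weight data type, so an
    // inference-time BN-fusing pass can consume them directly.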

Because of the complicated data conversion in the BN layer, test case
generation also needs to be updated: it takes the FP16 input/output
tensors and weights, converts the weights to FP32 for computation, and
converts the FP32 results back to FP16 for verification.
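
A hypothetical sketch of the verification-side conversion, written with the
Tensor API used elsewhere in this PR (the actual test generator lives outside
this diff; fp32_output is a made-up name for the FP32 reference result, and
TensorDim::DataType::FP16 is assumed to be the FP16 enum value):

    TensorDim out_dim = fp32_output.getDim();
    out_dim.setDataType(TensorDim::DataType::FP16);

    Tensor fp16_golden(out_dim, true); // allocate an FP16 tensor
    fp16_golden.copyData(fp32_output); // copyData converts on copy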

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed Jun 3, 2024
1 parent 9cb71dd commit a5b1545
Showing 10 changed files with 288 additions and 56 deletions.
6 changes: 4 additions & 2 deletions nntrainer/graph/network_graph.cpp
@@ -938,6 +938,9 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
@@ -1198,8 +1201,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
* Initialize all the layers, allocate output tensors for each layer
* and add optimizer related weights for the layer
*/
const std::vector<Var_Grad *> &outputs =
finalizeContext(lnode, inputs);
const std::vector<Var_Grad *> &outputs = finalizeContext(lnode, inputs);

/** no need to update input_map for the last layer */
if (idx == graph.size() - 1)
112 changes: 93 additions & 19 deletions nntrainer/layers/bn_layer.cpp
@@ -73,6 +73,10 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {

TensorDim dim(context.getFormat(), context.getWeightDataType());

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
dim.setDataType(TensorDim::DataType::FP32);
}

/// @note this logic cannot tell whether the channel is actually 1 or just not used.
auto &axis_prop = std::get<props::Axis>(bn_props);
unsigned int axis;
@@ -99,26 +103,32 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
}

wt_idx[BNParams::mu] =
context.requestWeight(dim, bnparams_mu, WeightRegularizer::NONE, 1.0f, 0.0f,
"moving_mean", false);
context.requestWeight(dim, dim, bnparams_mu, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_mean", false);
wt_idx[BNParams::var] =
context.requestWeight(dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
context.requestWeight(dim, dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_variance", false);
wt_idx[BNParams::gamma] =
context.requestWeight(dim, bnparams_gamma, WeightRegularizer::NONE, 1.0f,
weight_decay, "gamma", true);
context.requestWeight(dim, dim, bnparams_gamma, WeightRegularizer::NONE,
1.0f, weight_decay, "gamma", true);
wt_idx[BNParams::beta] =
context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
bias_decay, "beta", true);
context.requestWeight(dim, dim, bnparams_beta, WeightRegularizer::NONE,
1.0f, bias_decay, "beta", true);

/**
* caches the deviation -> input - avg(input)
* @todo check if avoiding this storage and adding dependency on input (no
* more in-place calculation) can save memory during memory optimization.
*/
TensorDim in_dim_ = in_dim;

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
in_dim_.setDataType(TensorDim::DataType::FP32);
}

wt_idx[BNParams::deviation] =
context.requestTensor(in_dim, "deviation", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
context.requestTensor(in_dim_, "deviation", Tensor::Initializer::NONE,
false, TensorLifespan::ITERATION_LIFESPAN);
/** caches the inverse standard deviation */
wt_idx[BNParams::invstd] =
context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false,
@@ -130,7 +140,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
* as the output of this layer need not be stored all the time.
*/
wt_idx[BNParams::t_full] =
context.requestTensor(in_dim, "tensor_full", Tensor::Initializer::NONE,
context.requestTensor(in_dim_, "tensor_full", Tensor::Initializer::NONE,
false, TensorLifespan::CALC_DERIV_LIFESPAN);
/**
* caches variance + epsilon as well.
@@ -164,8 +174,32 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
Tensor &beta = context.getWeight(wt_idx[BNParams::beta]);

Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
Tensor em_input, em_hidden;

Tensor &input_ = em_input;
Tensor &hidden_ = em_hidden;

if (training) {
if (context.getInput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
input_ =
context.getInput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
}

if (context.getOutput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
hidden_ =
context.getOutput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);

@@ -200,21 +234,46 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
deviation.multiply(invstd, hidden_);
hidden_.multiply_i(gamma);
hidden_.add_i(beta);

if (training && hidden_.getDataType() !=
context.getOutput(SINGLE_INOUT_IDX).getDataType())
context.getOutput(SINGLE_INOUT_IDX).copyData(hidden_);
}

void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {

Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX);

Tensor em_dx, deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() ==
TensorDim::DataType::FP32
? context.getOutgoingDerivative(SINGLE_INOUT_IDX)
: em_dx;

if (dx.empty())
dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX)
.clone(TensorDim::DataType::FP32);

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);
Tensor &cvar = context.getTensor(wt_idx[BNParams::cvar]);

Tensor &t_reduced = context.getTensor(wt_idx[BNParams::t_reduced]);
Tensor &t_full = context.getTensor(wt_idx[BNParams::t_full]);

deviation.multiply(deriv, t_full);
deviation.multiply((deriv_copyed ? deriv32 : deriv), t_full);
t_full.average(axes_to_reduce, t_reduced);
t_reduced.divide_i(cvar);
deviation.multiply_i(t_reduced);
@@ -233,22 +292,37 @@ void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
dbeta.divide(divider, t_reduced);
} else {
deriv.average(axes_to_reduce, t_reduced);
(deriv_copyed ? deriv32 : deriv).average(axes_to_reduce, t_reduced);
}

deriv.subtract(t_reduced, dx);
(deriv_copyed ? deriv32 : deriv).subtract(t_reduced, dx);
dx.subtract_i(deviation);

invstd.multiply_i(gamma);
dx.multiply_i(invstd);

if (dx.getDataType() !=
context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType())
context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(dx);
}

void BatchNormalizationLayer::calcGradient(RunLayerContext &context) {
/** dgamma is calculated in calcDerivative. dbeta is calculated here */
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

deriv.sum(axes_to_reduce, dbeta);
Tensor deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

(deriv_copyed ? deriv32 : deriv).sum(axes_to_reduce, dbeta);
}

void BatchNormalizationLayer::exportTo(
1 change: 1 addition & 0 deletions nntrainer/layers/layer_context.cpp
@@ -294,6 +294,7 @@ const Tensor RunLayerContext::getIncomingDerivative(unsigned int idx) const {
return getOutputGrad(idx);
}


/**
* @brief Get the Input tensor object
*
29 changes: 29 additions & 0 deletions nntrainer/layers/layer_context.h
@@ -210,6 +210,35 @@ class InitLayerContext {
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
* @param dim dimension of Variable of the weight
* @param dim_g dimension of Gradient of the weight
* @param init initializer for the weight
* @param reg regularizer for the weight
* @param reg_const regularization constant for the weight
* @param name name of the weight
* @param trainable if the weight is trainable (require gradient or not)
* @return unsigned int index of the weight for its getter
*
* @todo Consider providing a guarantee that the returned indices will always
* start from 0 and will always be incremental.
*/
unsigned int requestWeight(const TensorDim &dim, const TensorDim &dim_g,
const Tensor::Initializer init,
const WeightRegularizer reg, const float reg_const,
const float decay, const std::string &name,
bool trainable = true, unsigned int out_axis = 3) {

/** @note We assume the gradient type is the same as the Activation data
* type. */
weights_spec.emplace_back(dim, dim_g, init, reg, reg_const, decay,
clip_by_global_norm, trainable,
prefix + ":" + name, out_axis, loss_scale);
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
53 changes: 49 additions & 4 deletions nntrainer/layers/layer_node.cpp
@@ -19,6 +19,7 @@
#include <utility>

#include <activation_layer.h>
#include <bn_layer.h>
#include <app_context.h>
#include <base_properties.h>
#include <common_properties.h>
@@ -460,9 +461,11 @@ void LayerNode::exportTo(Exporter &exporter,
layer->exportTo(exporter, method);
}

void LayerNode::read(std::ifstream &file, bool opt_var) {
void LayerNode::read(std::ifstream &file, bool opt_var,
ml::train::ExecutionMode mode) {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

if (opt_var) {
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i) && getTrainable()) {
@@ -473,10 +476,29 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}
} else {

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
/// @note shared weights are only read at the first access
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).read(file);
if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {

/** @note the batch normalization layer needs full precision
* for training, but the weight can be saved with another type. For
* training, the BN weight type is fixed to full precision */

TensorDim dim = run_context->getWeight(i).getDim();
dim.setDataType(this->getWeightDataType());
Tensor T_read(dim, true);
T_read.read(file);
run_context->getWeight(i).copyData(T_read);
} else {
run_context->getWeight(i).read(file);
}
} else {
run_context->getWeight(i).read(file);
}
if (run_context->isMixedPrecision(i) && getTrainable()) {
run_context->getWeightFP32(i).copyData(run_context->getWeight(i));
}
@@ -485,7 +507,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}

void LayerNode::save(std::ofstream &file, bool opt_var) const {
void LayerNode::save(std::ofstream &file, bool opt_var,
ml::train::ExecutionMode mode) const {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

@@ -505,7 +528,29 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const {
// @note shared weights are only be saved at the first access
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).save(file);

/** @note For the batch normalization layer, we need full precision for
* training and the weight data type is full precision. But for
* inference, we have to save them in the activation data type. */

if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {
TensorDim dim = run_context->getWeight(i).getDim();

dim.setDataType(this->getWeightDataType());

Tensor T_save(dim, true);

T_save.copyData(run_context->getWeight(i));

T_save.save(file);
} else {
run_context->getWeight(i).save(file);
}
} else {
run_context->getWeight(i).save(file);
}
}
}
}