[ Layer ] Mixed Precision support for BN Layer
This PR adds mixed precision support to the batch normalization (BN)
layer. During training, the BN layer should run in full precision even when
the weight data is FP16. Therefore, reading the FP16 data and converting
the current weights and activations to FP32 is required.
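
A condensed sketch of that conversion pattern, matching the forwarding path
in nntrainer/layers/bn_layer.cpp below (the real code aliases em_input /
em_hidden through references; this is only an illustration):

    // Training path: promote FP16 activations to FP32 for the BN math,
    // then copy the FP32 result back into the FP16 output tensor.
    Tensor &in = context.getInput(SINGLE_INOUT_IDX);
    Tensor &out = context.getOutput(SINGLE_INOUT_IDX);

    // clone() converts FP16 data to FP32 (or just copies if already FP32).
    Tensor input_ = in.clone(TensorDim::DataType::FP32);
    Tensor hidden_ = out.clone(TensorDim::DataType::FP32);

    // ... full-precision batch normalization writes into hidden_ ...

    out.copyData(hidden_); // demote back to the activation data type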

For inference, we need compiler optimizations such as BN fusing, so
execution mode parameters are also added to compile.
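
The mode check added to finalize() below amounts to the following (restated
sketch; the comments are editorial):

    // Weight data type selection in BN finalize(), keyed on the execution
    // mode that compile/initialize now propagate.
    TensorDim dim(context.getFormat(), context.getWeightDataType());

    if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
      // Training: BN parameters and statistics are kept in full precision.
      dim.setDataType(TensorDim::DataType::FP32);
    }
    // Inference: the weights keep the configured weight data type, so an
    // inference-time BN-fusing pass can consume them directly.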

Because of the complicated data conversion in the BN layer, test case
generation also needs to be updated: it takes the FP16 input/output
tensors and weights, converts the weights to FP32 for computation, and
converts the FP32 results back to FP16 for verification.
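
A hypothetical sketch of the verification-side conversion, written with the
Tensor API used elsewhere in this PR (the actual test generator lives outside
this diff; fp32_output is a made-up name for the FP32 reference result, and
TensorDim::DataType::FP16 is assumed to be the FP16 enum value):

    TensorDim out_dim = fp32_output.getDim();
    out_dim.setDataType(TensorDim::DataType::FP16);

    Tensor fp16_golden(out_dim, true); // allocate an FP16 tensor
    fp16_golden.copyData(fp32_output); // copyData converts on copy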

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed Jun 3, 2024
1 parent 9cb71dd commit a5b1545
Showing 10 changed files with 288 additions and 56 deletions.
6 changes: 4 additions & 2 deletions nntrainer/graph/network_graph.cpp
@@ -938,6 +938,9 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
@@ -1198,8 +1201,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
* Initialize all the layers, allocate output tensors for each layer
* and add optimizer related weights for the layer
*/
const std::vector<Var_Grad *> &outputs =
finalizeContext(lnode, inputs);
const std::vector<Var_Grad *> &outputs = finalizeContext(lnode, inputs);

/** no need to update input_map for the last layer */
if (idx == graph.size() - 1)
112 changes: 93 additions & 19 deletions nntrainer/layers/bn_layer.cpp
@@ -73,6 +73,10 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {

TensorDim dim(context.getFormat(), context.getWeightDataType());

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
dim.setDataType(TensorDim::DataType::FP32);
}

/// @note this logic cannot tell whether the channel is actually 1 or just not used.
auto &axis_prop = std::get<props::Axis>(bn_props);
unsigned int axis;
@@ -99,26 +103,32 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
}

wt_idx[BNParams::mu] =
context.requestWeight(dim, bnparams_mu, WeightRegularizer::NONE, 1.0f, 0.0f,
"moving_mean", false);
context.requestWeight(dim, dim, bnparams_mu, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_mean", false);
wt_idx[BNParams::var] =
context.requestWeight(dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
context.requestWeight(dim, dim, bnparams_var, WeightRegularizer::NONE, 1.0f,
0.0f, "moving_variance", false);
wt_idx[BNParams::gamma] =
context.requestWeight(dim, bnparams_gamma, WeightRegularizer::NONE, 1.0f,
weight_decay, "gamma", true);
context.requestWeight(dim, dim, bnparams_gamma, WeightRegularizer::NONE,
1.0f, weight_decay, "gamma", true);
wt_idx[BNParams::beta] =
context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
bias_decay, "beta", true);
context.requestWeight(dim, dim, bnparams_beta, WeightRegularizer::NONE,
1.0f, bias_decay, "beta", true);

/**
* caches the deviation -> input - avg(input)
* @todo check if avoiding this storage and adding dependency on input (no
* more in-place calculation) can save memory during memory optimization.
*/
TensorDim in_dim_ = in_dim;

if (context.getExecutionMode() == ml::train::ExecutionMode::TRAIN) {
in_dim_.setDataType(TensorDim::DataType::FP32);
}

wt_idx[BNParams::deviation] =
context.requestTensor(in_dim, "deviation", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
context.requestTensor(in_dim_, "deviation", Tensor::Initializer::NONE,
false, TensorLifespan::ITERATION_LIFESPAN);
/** caches the inverse standard deviation */
wt_idx[BNParams::invstd] =
context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false,
@@ -130,7 +140,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
* as the output of this layer need not be stored all the time.
*/
wt_idx[BNParams::t_full] =
context.requestTensor(in_dim, "tensor_full", Tensor::Initializer::NONE,
context.requestTensor(in_dim_, "tensor_full", Tensor::Initializer::NONE,
false, TensorLifespan::CALC_DERIV_LIFESPAN);
/**
* caches variance + epsilon as well.
@@ -164,8 +174,32 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
Tensor &beta = context.getWeight(wt_idx[BNParams::beta]);

Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
Tensor em_input, em_hidden;

Tensor &input_ = em_input;
Tensor &hidden_ = em_hidden;

if (training) {
if (context.getInput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
input_ =
context.getInput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
}

if (context.getOutput(SINGLE_INOUT_IDX).getDataType() !=
TensorDim::DataType::FP32) {
hidden_ =
context.getOutput(SINGLE_INOUT_IDX).clone(TensorDim::DataType::FP32);
} else {
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}
} else {
input_ = context.getInput(SINGLE_INOUT_IDX);
hidden_ = context.getOutput(SINGLE_INOUT_IDX);
}

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);

@@ -200,21 +234,46 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
deviation.multiply(invstd, hidden_);
hidden_.multiply_i(gamma);
hidden_.add_i(beta);

if (training && hidden_.getDataType() !=
context.getOutput(SINGLE_INOUT_IDX).getDataType())
context.getOutput(SINGLE_INOUT_IDX).copyData(hidden_);
}

void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {

Tensor &gamma = context.getWeight(wt_idx[BNParams::gamma]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX);

Tensor em_dx, deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() ==
TensorDim::DataType::FP32
? context.getOutgoingDerivative(SINGLE_INOUT_IDX)
: em_dx;

if (dx.empty())
dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX)
.clone(TensorDim::DataType::FP32);

Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);
Tensor &cvar = context.getTensor(wt_idx[BNParams::cvar]);

Tensor &t_reduced = context.getTensor(wt_idx[BNParams::t_reduced]);
Tensor &t_full = context.getTensor(wt_idx[BNParams::t_full]);

deviation.multiply(deriv, t_full);
deviation.multiply((deriv_copyed ? deriv32 : deriv), t_full);
t_full.average(axes_to_reduce, t_reduced);
t_reduced.divide_i(cvar);
deviation.multiply_i(t_reduced);
@@ -233,22 +292,37 @@ void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
dbeta.divide(divider, t_reduced);
} else {
deriv.average(axes_to_reduce, t_reduced);
(deriv_copyed ? deriv32 : deriv).average(axes_to_reduce, t_reduced);
}

deriv.subtract(t_reduced, dx);
(deriv_copyed ? deriv32 : deriv).subtract(t_reduced, dx);
dx.subtract_i(deviation);

invstd.multiply_i(gamma);
dx.multiply_i(invstd);

if (dx.getDataType() !=
context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType())
context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(dx);
}

void BatchNormalizationLayer::calcGradient(RunLayerContext &context) {
/** dgamma is calculated in calcDerivative. dbeta is calculated here */
Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
const Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);

deriv.sum(axes_to_reduce, dbeta);
Tensor deriv32;
bool deriv_copyed = false;

const Tensor deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
if (deriv.getDataType() != TensorDim::DataType::FP32) {
deriv_copyed = true;
TensorDim dim = deriv.getDim();
dim.setDataType(TensorDim::DataType::FP32);
deriv32 = Tensor(dim, true);
deriv32.copyData(deriv);
}

(deriv_copyed ? deriv32 : deriv).sum(axes_to_reduce, dbeta);
}

void BatchNormalizationLayer::exportTo(
1 change: 1 addition & 0 deletions nntrainer/layers/layer_context.cpp
@@ -294,6 +294,7 @@ const Tensor RunLayerContext::getIncomingDerivative(unsigned int idx) const {
return getOutputGrad(idx);
}


/**
* @brief Get the Input tensor object
*
29 changes: 29 additions & 0 deletions nntrainer/layers/layer_context.h
@@ -210,6 +210,35 @@ class InitLayerContext {
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
* @param dim dimension of Variable of the weight
* @param dim_g dimension of Gradient of the weight
* @param init initializer for the weight
* @param reg regularizer for the weight
* @param reg_const regularization constant for the weight
* @param name name of the weight
* @param trainable if the weight is trainable (require gradient or not)
* @return unsigned int index of the weight for its getter
*
* @todo Consider providing a guarantee that the returned indices will always
* start from 0 and will always be incremental.
*/
unsigned int requestWeight(const TensorDim &dim, const TensorDim &dim_g,
const Tensor::Initializer init,
const WeightRegularizer reg, const float reg_const,
const float decay, const std::string &name,
bool trainable = true, unsigned int out_axis = 3) {

/** @note We assume the gradient type is the same as the Activation data
* type. */
weights_spec.emplace_back(dim, dim_g, init, reg, reg_const, decay,
clip_by_global_norm, trainable,
prefix + ":" + name, out_axis, loss_scale);
return weights_spec.size() - 1;
}

/**
* @brief Request a new weight for the layer
*
53 changes: 49 additions & 4 deletions nntrainer/layers/layer_node.cpp
@@ -19,6 +19,7 @@
#include <utility>

#include <activation_layer.h>
#include <bn_layer.h>
#include <app_context.h>
#include <base_properties.h>
#include <common_properties.h>
@@ -460,9 +461,11 @@ void LayerNode::exportTo(Exporter &exporter,
layer->exportTo(exporter, method);
}

void LayerNode::read(std::ifstream &file, bool opt_var) {
void LayerNode::read(std::ifstream &file, bool opt_var,
ml::train::ExecutionMode mode) {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

if (opt_var) {
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i) && getTrainable()) {
@@ -473,10 +476,29 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}
} else {

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
/// @note shared weights are only read at the first access
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).read(file);
if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {

/** @note the batch normalization layer needs full precision
* for training, but the weight can be saved with another type. For
* training, the BN weight type is fixed to full precision */

TensorDim dim = run_context->getWeight(i).getDim();
dim.setDataType(this->getWeightDataType());
Tensor T_read(dim, true);
T_read.read(file);
run_context->getWeight(i).copyData(T_read);
} else {
run_context->getWeight(i).read(file);
}
} else {
run_context->getWeight(i).read(file);
}
if (run_context->isMixedPrecision(i) && getTrainable()) {
run_context->getWeightFP32(i).copyData(run_context->getWeight(i));
}
@@ -485,7 +507,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
}
}

void LayerNode::save(std::ofstream &file, bool opt_var) const {
void LayerNode::save(std::ofstream &file, bool opt_var,
ml::train::ExecutionMode mode) const {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";

@@ -505,7 +528,29 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const {
// @note shared weights are only be saved at the first access
for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).save(file);

/** @note For the batch normalization layer, we need full precision for
* training and the weight data type is full precision. But for
* inference, we have to save them in the activation data type. */

if (layer->getType() == BatchNormalizationLayer::type) {
if ((mode == ml::train::ExecutionMode::TRAIN) &&
(this->getWeightDataType() != TensorDim::DataType::FP32)) {
TensorDim dim = run_context->getWeight(i).getDim();

dim.setDataType(this->getWeightDataType());

Tensor T_save(dim, true);

T_save.copyData(run_context->getWeight(i));

T_save.save(file);
} else {
run_context->getWeight(i).save(file);
}
} else {
run_context->getWeight(i).save(file);
}
}
}
}