[ Loss Scale ] Enable defining different types for Variable and Gradient for Loss Scale #2562

Merged 4 commits on May 4, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions nntrainer/graph/network_graph.cpp
@@ -869,7 +869,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,

const auto &w_specs = init_context.getWeightsSpec();
for (auto i = 0u; i < w_specs.size(); ++i) {
shared_weight_names.emplace_back(std::get<7>(w_specs.at(i)));
shared_weight_names.emplace_back(std::get<8>(w_specs.at(i)));
}
}

@@ -1018,7 +1018,7 @@ NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,

const auto &w_specs = init_context.getWeightsSpec();
for (auto i = 0u; i < w_specs.size(); ++i) {
shared_weight_names.emplace_back(std::get<7>(w_specs.at(i)));
shared_weight_names.emplace_back(std::get<8>(w_specs.at(i)));
}
}

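The index bump from std::get<7> to std::get<8> follows from the WeightSpec change in nntrainer/tensor/tensor_wrap_specs.h further down: a second TensorDim (the gradient dimension) is inserted at position 1, so every later field shifts by one. A minimal sketch of the layout the loop now reads (the comments are illustrative, not part of the diff):

```cpp
// old WeightSpec: <dim, init, reg, reg_const, decay, clip, need_grad, name, axis>
//                 -> shared name was std::get<7>
// new WeightSpec: <dim_v, dim_g, init, reg, reg_const, decay, clip, need_grad,
//                  name, axis, loss_scale>
//                 -> shared name is now std::get<8>
const auto &w_specs = init_context.getWeightsSpec();
for (auto i = 0u; i < w_specs.size(); ++i) {
  shared_weight_names.emplace_back(std::get<8>(w_specs.at(i)));
}
```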
10 changes: 10 additions & 0 deletions nntrainer/layers/common_properties.h
@@ -1367,6 +1367,16 @@ class ClipGradByGlobalNorm : public Property<float> {
using prop_tag = float_prop_tag; /**< property type */
};

/**
* @brief Property for the loss scale value used in mixed precision
*
*/
class LossScaleForMixed : public Property<float> {
public:
static constexpr const char *key = "loss_scale"; /**< unique key to access */
using prop_tag = float_prop_tag; /**< property type */
};

/**
* @brief Learning Rate props
*
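Since LossScaleForMixed registers the key "loss_scale" on every layer node (see layer_node.cpp below), it can in principle be set like any other layer property. A hedged sketch using the public ml::train API — the value 128 is arbitrary, and whether per-layer use is intended (rather than propagation from the model level, as in neuralnet.cpp) is an assumption:

```cpp
#include <layer.h>

// Hypothetical per-layer setting of the new property; 128 is a placeholder.
auto fc = ml::train::createLayer("fully_connected",
                                 {"unit=10", "loss_scale=128"});
```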
6 changes: 4 additions & 2 deletions nntrainer/layers/layer_context.cpp
@@ -43,15 +43,17 @@ InitLayerContext::InitLayerContext(const std::vector<TensorDim> &dim,
bool in_place_, const std::string &n,
const std::string &prefix_,
const float max_norm,
std::array<std::string, 3> tensor_type_) :
std::array<std::string, 3> tensor_type_,
const float loss_scale_) :
input_dim(dim),
in_place(in_place_),
clip_by_global_norm(max_norm),
output_specs(),
req_out_is_connected(req_out_connected),
name(n),
prefix(prefix_),
tensor_type(tensor_type_) {
tensor_type(tensor_type_),
loss_scale(loss_scale_) {
NNTR_THROW_IF(!validate(), std::invalid_argument)
<< "Invalid init context name: " << name
<< " num inputs: " << getNumInputs();
17 changes: 13 additions & 4 deletions nntrainer/layers/layer_context.h
@@ -62,7 +62,8 @@ class InitLayerContext {
const std::string &n = "", const std::string &prefix_ = "",
const float max_norm = 0.0,
std::array<std::string, 3> tensor_type_ = {"NCHW", "FP32",
"FP32"});
"FP32"},
const float loss_scale = 0.0);
/**
* @brief get Tensor Format of Layer
*
@@ -171,7 +172,7 @@
/**
* @brief Request a new weight for the layer
*
* @param dim dimension of the weight
* @param dim dimension of the Variable (weight) tensor
* @param init initializer for the weight
* @param reg regularizer for the weight
* @param reg_const regularization constant for the weight
@@ -187,9 +188,16 @@
const WeightRegularizer reg, const float reg_const,
const float decay, const std::string &name,
bool trainable = true, unsigned int out_axis = 3) {
weights_spec.emplace_back(dim, init, reg, reg_const, decay,

/** @note We assume the gradient data type is the same as the activation
* data type. */
TensorDim dim_g(dim);

dim_g.setDataType(getActivationDataType());

weights_spec.emplace_back(dim, dim_g, init, reg, reg_const, decay,
clip_by_global_norm, trainable,
prefix + ":" + name, out_axis);
prefix + ":" + name, out_axis, loss_scale);
return weights_spec.size() - 1;
}

@@ -356,6 +364,7 @@ class InitLayerContext {
std::string name; /**< name of the layer */
std::string prefix; /**< prefix of the layer */
std::array<std::string, 3> tensor_type;
float loss_scale; /**< loss_scale value */
};

/**
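To make the new requestWeight() behavior concrete, here is a minimal sketch of a layer requesting a weight under mixed precision: the variable keeps the weight data type, while requestWeight() copies the dimension and switches its data type to the activation type for the gradient. The names in_dim, unit, and getWeightDataType() are illustrative assumptions, not taken from this diff:

```cpp
// Sketch of some Layer::finalize(InitLayerContext &context).
TensorDim w_dim(1, 1, in_dim, unit);            // variable (weight) shape
w_dim.setDataType(context.getWeightDataType()); // e.g. FP16 weights

// Inside requestWeight(), dim_g is a copy of w_dim whose data type is set to
// context.getActivationDataType() (e.g. FP32), so gradients can accumulate at
// higher precision than the FP16 variable.
unsigned int weight_idx = context.requestWeight(
  w_dim, Tensor::Initializer::XAVIER_UNIFORM, WeightRegularizer::NONE,
  /*reg_const=*/0.0f, /*decay=*/0.0f, "weight", /*trainable=*/true);
```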
13 changes: 9 additions & 4 deletions nntrainer/layers/layer_node.cpp
@@ -182,9 +182,10 @@ LayerNode::LayerNode(std::unique_ptr<nntrainer::Layer> &&l) :
needs_calc_gradient(false),
output_connections(),
run_context(nullptr),
layer_node_props(new PropsType(
props::Name(), props::Distribute(), props::Trainable(), {}, {},
props::SharedFrom(), props::ClipGradByGlobalNorm(), props::Packed())),
layer_node_props(
new PropsType(props::Name(), props::Distribute(), props::Trainable(), {},
{}, props::SharedFrom(), props::ClipGradByGlobalNorm(),
props::Packed(), props::LossScaleForMixed())),
layer_node_props_realization(
new RealizationPropsType(props::Flatten(), props::Activation())),
loss(new props::Loss()),
@@ -598,9 +599,13 @@ InitLayerContext LayerNode::finalize(const std::vector<TensorDim> &input_dims,

const auto &scope = getSharedFrom().empty() ? getName() : getSharedFrom();
float max_norm = 0.0;
float loss_scale = 0.0;
if (!std::get<props::ClipGradByGlobalNorm>(*layer_node_props).empty())
max_norm = std::get<props::ClipGradByGlobalNorm>(*layer_node_props).get();

if (!std::get<props::LossScaleForMixed>(*layer_node_props).empty())
loss_scale = std::get<props::LossScaleForMixed>(*layer_node_props).get();

if (!std::get<props::Packed>(*layer_node_props).empty()) {
bool isPacked = std::get<props::Packed>(*layer_node_props);
if (!isPacked) {
@@ -622,7 +627,7 @@ InitLayerContext LayerNode::finalize(const std::vector<TensorDim> &input_dims,

auto context = InitLayerContext(actual_input_dims, out_info,
executeInPlace() != InPlace::NONE, getName(),
scope, max_norm, tensor_type);
scope, max_norm, tensor_type, loss_scale);

layer->finalize(context);

11 changes: 6 additions & 5 deletions nntrainer/layers/layer_node.h
@@ -52,6 +52,7 @@ class SharedFrom;
class InputConnection;
class ClipGradByGlobalNorm;
class Packed;
class LossScaleForMixed;
} // namespace props

/**
@@ -939,11 +940,11 @@ will also contain the properties of the layer. The properties will be copied
upon final creation. Editing properties of the layer after init will not affect the
properties in the context/graph unless intended. */

using PropsType =
std::tuple<props::Name, props::Distribute, props::Trainable,
std::vector<props::InputConnection>,
std::vector<props::InputShape>, props::SharedFrom,
props::ClipGradByGlobalNorm, props::Packed>;
using PropsType = std::tuple<props::Name, props::Distribute, props::Trainable,
std::vector<props::InputConnection>,
std::vector<props::InputShape>,
props::SharedFrom, props::ClipGradByGlobalNorm,
props::Packed, props::LossScaleForMixed>;

using RealizationPropsType = std::tuple<props::Flatten, props::Activation>;
/** these realization properties results in addition of new layers, hence
2 changes: 2 additions & 0 deletions nntrainer/models/model_common_properties.cpp
@@ -39,4 +39,6 @@ MemorySwapLookahead::MemorySwapLookahead(const unsigned int &value) {
ModelTensorDataType::ModelTensorDataType(ModelTensorDataTypeInfo::Enum value) {
set(value);
}
LossScale::LossScale(float value) { set(value); }

} // namespace nntrainer::props
11 changes: 11 additions & 0 deletions nntrainer/models/model_common_properties.h
@@ -211,6 +211,17 @@ class ModelTensorDataType final : public EnumProperty<ModelTensorDataTypeInfo> {
ModelTensorDataTypeInfo::Enum::W32A32);
};

/**
* @brief LossScale property, loss is scaled by this value
*
*/
class LossScale : public Property<float> {
public:
LossScale(float value = 0.0f);
static constexpr const char *key = "loss_scale"; /**< unique key to access */
using prop_tag = float_prop_tag; /**< property type */
};

} // namespace nntrainer::props

#endif
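For reference, a hedged end-to-end sketch of the new model property. The key "loss_scale" comes from this diff; createModel/setProperty are the usual ml::train calls, the value 65536 is only an example, and the default 0.0 appears to leave loss scaling unset:

```cpp
#include <model.h>

int main() {
  // Sketch: set a model-wide loss scale; compile() then copies
  // "loss_scale=<value>" to every layer node (see neuralnet.cpp below).
  auto model = ml::train::createModel(ml::train::ModelType::NEURAL_NET);
  model->setProperty({"loss_scale=65536"});
  return 0;
}
```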
10 changes: 8 additions & 2 deletions nntrainer/models/neuralnet.cpp
@@ -65,7 +65,8 @@
namespace nntrainer {

NeuralNetwork::NeuralNetwork() :
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm()),
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
props::LossScale()),
model_flex_props(
props::Epochs(), props::TrainingBatchSize(), props::SavePath(),
props::ContinueTrain(), props::SaveBestPath(), props::MemoryOptimization(),
@@ -83,7 +84,8 @@ NeuralNetwork::NeuralNetwork() :
}

NeuralNetwork::NeuralNetwork(AppContext app_context_) :
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm()),
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
props::LossScale()),
model_flex_props(
props::Epochs(), props::TrainingBatchSize(), props::SavePath(),
props::ContinueTrain(), props::SaveBestPath(), props::MemoryOptimization(),
@@ -189,6 +191,9 @@ int NeuralNetwork::compile() {
!prop.empty()) {
node->setProperty({"clip_grad_by_norm=" + to_string(prop)});
}
if (auto &prop = std::get<props::LossScale>(model_props); !prop.empty()) {
node->setProperty({"loss_scale=" + to_string(prop)});
}
model_graph.addLayer(node);
}

@@ -1018,6 +1023,7 @@ int NeuralNetwork::train_run(

auto train_for_iteration =
[this, stop_cb, stop_user_data](RunStats &stat, DataBuffer &buffer) {
ml_loge("train for iteration");
forwarding(true, stop_cb, stop_user_data);
backwarding(iter++, stop_cb, stop_user_data);

59 changes: 31 additions & 28 deletions nntrainer/models/neuralnet.h
@@ -221,10 +221,11 @@ class NeuralNetwork : public ml::train::Model {
/**
* @brief Forward Propagation of the neural network
*/
sharedConstTensors forwarding(bool training = true,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);
sharedConstTensors forwarding(
bool training = true,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);

/**
* @brief Forward Propagation of the neural network
@@ -239,12 +240,11 @@
/**
* @brief Incremental forward Propagation of the neural network
*/
sharedConstTensors
incremental_forwarding(unsigned int from, unsigned int to,
bool training = true,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);
sharedConstTensors incremental_forwarding(
unsigned int from, unsigned int to, bool training = true,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);

/**
* @brief Incremental forward Propagation of the neural network
@@ -261,10 +261,11 @@
* @brief Backward Propagation of the neural network
* @param[in] iteration Iteration Number for the optimizer
*/
void backwarding(int iteration,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);
void backwarding(
int iteration,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr);

/**
* @copydoc Model::save(const std::string &file_path, ml::train::ModelFormat
@@ -329,13 +330,14 @@
* @retval #ML_ERROR_NONE Successful.
* @retval #ML_ERROR_INVALID_PARAMETER invalid parameter.
*/
int train(const std::vector<std::string> &values = {},
std::function<bool(void *)> stop_cb =
[](void *stop_user_data) { return false; },
void *stop_user_data = nullptr,
std::function<void(void *)> epoch_complete_cb =
[](void *epoch_user_data) { return false; },
void *epoch_user_data = nullptr) override;
int train(
const std::vector<std::string> &values = {},
std::function<bool(void *)> stop_cb =
[](void *stop_user_data) { return false; },
void *stop_user_data = nullptr,
std::function<void(void *)> epoch_complete_cb =
[](void *epoch_user_data) { return false; },
void *epoch_user_data = nullptr) override;

/**
* @brief Run NeuralNetwork inference
@@ -630,7 +632,8 @@ s * @retval shared_ptr<const Tensor>
props::TensorFormat, props::ModelTensorDataType>;
using RigidPropTypes =
std::tuple<props::LossType, std::vector<props::InputConnection>,
std::vector<props::LabelLayer>, props::ClipGradByGlobalNorm>;
std::vector<props::LabelLayer>, props::ClipGradByGlobalNorm,
props::LossScale>;

RigidPropTypes model_props; /**< model props */
FlexiblePropTypes model_flex_props; /**< model train props */
@@ -709,12 +712,12 @@ s * @retval shared_ptr<const Tensor>
* @retval #ML_ERROR_NONE Successful.
* @retval #ML_ERROR_INVALID_PARAMETER invalid parameter.
*/
int train_run(std::function<bool(void *)> stop_cb =
[](void *) { return false; },
void *user_data = nullptr,
std::function<void(void *)> epoch_complete_cb =
[](void *) { return false; },
void *data = nullptr);
int train_run(
std::function<bool(void *)> stop_cb = [](void *) { return false; },
void *user_data = nullptr,
std::function<void(void *)> epoch_complete_cb =
[](void *) { return false; },
void *data = nullptr);

/**
* @brief Swap function for the class
18 changes: 8 additions & 10 deletions nntrainer/tensor/manager.cpp
@@ -52,10 +52,7 @@

namespace nntrainer {
MMapedMemory::MMapedMemory(size_t size, bool allocate_fd_) :
fd(-1),
buf(nullptr),
buf_size(0),
allocate_fd(allocate_fd_) {
fd(-1), buf(nullptr), buf_size(0), allocate_fd(allocate_fd_) {

#ifndef __ANDROID__
if (allocate_fd) {
@@ -386,8 +383,9 @@ std::vector<Weight *> Manager::requestWeights(
size_t current_size = weights_v2.size();

for (unsigned int i = 0; i < weights_spec.size(); ++i) {
auto &[dim, t_initializer, w_reg, w_reg_const, decay, clip_by_global_norm,
need_gradient, name, axis] = weights_spec.at(i);
auto &[dim_v, dim_g, t_initializer, w_reg, w_reg_const, decay,
clip_by_global_norm, need_gradient, name, axis, loss_scale] =
weights_spec.at(i);

std::vector<unsigned int> var_exec_order;
for (auto order : default_var_exec_order) {
@@ -422,7 +420,7 @@
/// shared_name is used and the orignal name is discarded
const auto &shared_name = shared_names.at(i);
/** case when shared names are given */
var = weight_pool.requestOrExtend(shared_name, dim, var_exec_order,
var = weight_pool.requestOrExtend(shared_name, dim_v, var_exec_order,
var_ls, t_initializer);

if (trainable && need_gradient) {
@@ -431,13 +429,13 @@
* for each layer anymore and it is hard to overwritten.
*/
grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
dim, grad_exec_order, grad_ls,
dim_g, grad_exec_order, grad_ls,
Tensor::Initializer::ZEROS);
}
} else {
/** case requesting fresh weights */
var =
weight_pool.request(name, dim, var_exec_order, var_ls, t_initializer);
weight_pool.request(name, dim_v, var_exec_order, var_ls, t_initializer);

if (trainable && need_gradient) {
/** is_wgrad is the index which is true when it is the gradient tensor
@@ -447,7 +445,7 @@
bool is_wgrad = true;
if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm))
is_wgrad = false;
grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim,
grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g,
grad_exec_order, grad_ls,
Tensor::Initializer::ZEROS, is_wgrad);
}
7 changes: 4 additions & 3 deletions nntrainer/tensor/tensor_wrap_specs.h
@@ -73,10 +73,11 @@ enum class TensorLifespan {
*
* @details The tuple values are dimension, initializer, regularizer,
* regularizer_constant, decay, clip gradient constant, need_gradient property,
* name and output axis of the tensor object.
* name, output axis of the tensor object, and the loss scale factor.
*/
typedef std::tuple<TensorDim, Tensor::Initializer, WeightRegularizer, float,
float, float, bool, const std::string, unsigned int>
typedef std::tuple<TensorDim, TensorDim, Tensor::Initializer, WeightRegularizer,
float, float, float, bool, const std::string, unsigned int,
float>
WeightSpec;

/**
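Because WeightSpec is positional, a short illustrative construction may help keep the new field order straight; every value below is a placeholder, not taken from the diff:

```cpp
TensorDim var_dim(1, 1, 64, 64);  // variable shape (placeholder values)
TensorDim grad_dim(var_dim);      // gradient shape; its data type follows the activations

WeightSpec spec{
  var_dim,                      // 0: variable (weight) dimension
  grad_dim,                     // 1: gradient dimension
  Tensor::Initializer::ZEROS,   // 2: initializer
  WeightRegularizer::NONE,      // 3: regularizer
  0.0f,                         // 4: regularizer constant
  0.0f,                         // 5: decay
  0.0f,                         // 6: clip-by-global-norm constant
  true,                         // 7: need_gradient
  "layer0:weight",              // 8: name
  3u,                           // 9: output axis
  65536.0f                      // 10: loss scale factor
};
```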