[Wait for #2607] [ Layer ] Mixed Precision support for BN Layer #2615

Closed
wants to merge 17 commits
2 changes: 1 addition & 1 deletion Applications/KNN/jni/meson.build
@@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)

test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
2 changes: 1 addition & 1 deletion api/ccapi/include/model.h
@@ -136,7 +136,7 @@ class Model {
* @retval #ML_ERROR_NONE Successful.
* @retval #ML_ERROR_INVALID_PARAMETER invalid parameter.
*/
virtual int compile() = 0;
virtual int compile(ExecutionMode exec_mode_ = ExecutionMode::TRAIN) = 0;

/**
* @brief Initialize Network. This should be called after setting the
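The compile() change above adds a default execution-mode argument so a model can be compiled either for training or for an inference-only graph. Below is a minimal usage sketch, not from this PR; it assumes the ccapi factory ml::train::createModel() and that the ExecutionMode enum used in the new signature is exposed under ml::train with TRAIN and INFERENCE values.

```cpp
// Sketch only: exercises the new default argument on Model::compile().
// ExecutionMode's namespace/header location is an assumption based on the diff.
#include <model.h>

int main() {
  auto model = ml::train::createModel(ml::train::ModelType::NEURAL_NET);

  // Default keeps the old behavior: the graph is compiled for training.
  model->compile();

  // With the new parameter, a caller could instead request an inference graph:
  // model->compile(ml::train::ExecutionMode::INFERENCE);
  return 0;
}
```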
1 change: 1 addition & 0 deletions debian/nntrainer-dev.install
@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
17 changes: 11 additions & 6 deletions meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
if get_option('platform') == 'tizen'
add_project_arguments(['-mavx2'], language: ['c','cpp'])
else
add_project_arguments(['-march=native'], language: ['c','cpp'])
endif
message('-march=native added for AVX hardware acceleration.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
2 changes: 1 addition & 1 deletion meson_options.txt
@@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false)
option('enable-cublas', type: 'boolean', value: false)
option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: true)
option('enable-opencl', type: 'boolean', value: false)

# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
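The build changes above turn `enable-avx` on by default and add `-DUSE_AVX=1` independently of the fp16 option, with `-mavx2` on Tizen and `-march=native` elsewhere. The sketch below is only an illustration of the kind of guard such a define typically controls (it is not the contents of the newly installed blas_avx.h); the function name and layout are hypothetical, and it assumes the compiler flags above are in effect.

```cpp
// Sketch: a USE_AVX compile-time define gating a vectorized path with a
// scalar fallback. Illustrative only.
#include <cstddef>

#ifdef USE_AVX
#include <immintrin.h>
#endif

void scale_fp32(const float *in, float *out, float alpha, std::size_t n) {
  std::size_t i = 0;
#ifdef USE_AVX
  const __m256 a = _mm256_set1_ps(alpha);
  for (; i + 8 <= n; i += 8) {
    __m256 v = _mm256_loadu_ps(in + i);               // load 8 floats
    _mm256_storeu_ps(out + i, _mm256_mul_ps(v, a));   // scale and store
  }
#endif
  for (; i < n; ++i)                                  // scalar tail / fallback
    out[i] = alpha * in[i];
}
```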
9 changes: 9 additions & 0 deletions nntrainer/graph/graph_core.cpp
@@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const {
return Sorted.at(ith);
}

const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const {
return sorted_node_map.at(name);
}

void GraphCore::makeAdjacencyList(
std::vector<std::list<std::shared_ptr<GraphNode>>> &adj) {
/** initialize the adj list */
@@ -93,6 +97,11 @@ void GraphCore::topologicalSort() {

if (Sorted.size() != node_list.size())
throw std::runtime_error("Internal error in topologicalSort");
unsigned int idx = 0;
for (auto n : Sorted) {
sorted_node_map[n->getName()] = idx;
idx++;
}
}

const std::shared_ptr<GraphNode> &
8 changes: 8 additions & 0 deletions nntrainer/graph/graph_core.h
@@ -91,6 +91,13 @@ class GraphCore {
*/
const std::shared_ptr<GraphNode> &getSortedNode(unsigned int ith) const;

/**
* @brief getter of Sorted GraphNode index with name
* @param[in] name layer name
* @return index of the node in the sorted node list
*/
const unsigned int getSortedNodeIdx(const std::string &name) const;

/**
* @brief getter of GraphNode with node name
* @param[in] node name
@@ -252,6 +259,7 @@ class GraphCore {
std::vector<std::shared_ptr<GraphNode>>
node_list; /**< Unordered Node List */
std::unordered_map<std::string, int> node_map; /**< Unordered Node map */
std::unordered_map<std::string, int> sorted_node_map; /**< Node name to sorted index map */
std::vector<std::shared_ptr<GraphNode>> Sorted; /**< Ordered Node List */
bool sorted; /** if the node_list is sorted */

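getSortedNodeIdx() and sorted_node_map give the graph an O(1) lookup from a layer name to its position in the topologically sorted node list; the network_graph change below uses this to restart forwarding from the exact layer whose gradients overflowed. A standalone sketch of that lookup pattern follows, with hypothetical node names rather than nntrainer types.

```cpp
// Sketch: name -> sorted-index map built right after a topological sort,
// then used to resume iteration from a named node. Illustrative only.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> sorted = {"input", "conv0", "bn0", "fc0", "loss"};

  // Built once after sorting (plays the role of sorted_node_map in the diff).
  std::unordered_map<std::string, unsigned int> sorted_idx;
  for (unsigned int i = 0; i < sorted.size(); ++i)
    sorted_idx[sorted[i]] = i;

  // Later: resume forwarding from the node where backwarding detected a NaN.
  for (auto it = sorted.cbegin() + sorted_idx.at("bn0"); it != sorted.cend(); ++it)
    std::cout << "re-run forward for " << *it << '\n';
  return 0;
}
```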
156 changes: 121 additions & 35 deletions nntrainer/graph/network_graph.cpp
@@ -337,7 +337,7 @@ void NetworkGraph::applyGradients(
continue;
}

if (rc.isGradientClipByGlobalNorm(i)) {
if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
/**
* @note the weights whose gradient are to be clipped by global norm will
* be clipped at once at the end of iteration and applied then.
@@ -393,56 +393,113 @@ sharedConstTensors NetworkGraph::incremental_forwarding(
return out;
}

void NetworkGraph::backwarding(
bool NetworkGraph::backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) const {
std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &lazy_apply_grad_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
auto iter_begin = getBackwardingBeginIter();
auto iter_end = getBackwardingEndIter();
bool is_valid = true;

/// there is no layer to train, so backwarding is essentially noop
if (iter_begin == iter_end) {
return;
return true;
}

auto const &lptr_begin = (*iter_begin);
// graph_const_reverse_iterator
auto iter_ = iter_begin;

if (lptr_begin->requireLabel() == false)
throw std::runtime_error(
"Error: last layer does not accept label, we can't train");

for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
auto &ln = *iter;
for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
auto &ln = *iter_;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
backwarding_op(ln, iteration);
is_valid = backwarding_op(ln, iteration);
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (!is_valid) {
std::cout << ln->getName() << " : Gradient has NaN --> "
<< ln->getRunContext().getLossScale() << std::endl;
break;
}
}

/** perform clipping of the gradients by global norm if any */
if (clip_weights.empty())
return;
if (!is_valid) {
/** if has NaN
* 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
* 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
* 3. return false --> run backwarding again;
*/
float scale = (*iter_)->getRunContext().getLossScale();

NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
<< "Loss Scale Factor is 1.0f";

float s = scale > 1.5f ? scale * 0.5f : 1.0f;

/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
auto const &w = clip_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
resetLossScale(s);

auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
ln->needsOutputSetZero(true);
}

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
forwarding_op(*iter, true);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
}

return false;
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
w->clipGradientByGlobalNorm(global_norm);

/** perform clipping of the gradients by global norm if any */
if (lazy_weights.empty())
return true;

if (is_clip_grad) {
/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
float *global_norm_data = global_norm_t.getData();
for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
auto const &w = lazy_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
}
float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : lazy_weights) {
w->clipGradientByGlobalNorm(global_norm);
}
}
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
apply_grad_clip_op(*w, iteration);
for (auto w : lazy_weights) {
lazy_apply_grad_op(*w, iteration);
}
nan_count++;

/** @todo : handle as property : growth_interval : default --> 2000 */

if (nan_count > 2000) {
float scale = (*iter_)->getRunContext().getLossScale();
/** @todo growth_factor : default --> 2.0 */
float s = scale * 2.0f;
resetLossScale(s);
nan_count = 0;
}

return true;
}

LayerNode *NetworkGraph::computeBackwardEnd() {
@@ -580,8 +637,15 @@ void NetworkGraph::addLayer(std::shared_ptr<LayerNode> layer) {

InPlace
NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
if (!lnode->supportInPlace())

if (!lnode->supportInPlace()) {
return InPlace::NONE;
}

if (lnode->getType() == InputLayer::type &&
!istrequal(getTensorType()[2], "FP32")) {
return InPlace::NONE;
}

/** layers which behave as a no-op - flatten */
auto no_op = [](const std::shared_ptr<LayerNode> &lnode) {
@@ -746,7 +810,7 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
[](const Var_Grad *vg) { return vg->getDim(); });

/** finalize the layer and get the final context */
auto init_context = lnode->finalize(input_dims, getTensorType());
auto init_context = lnode->finalize(input_dims, getTensorType(), exec_mode);

/**
* Request manager for either a pre-allocated output as input or a newly
@@ -768,9 +832,10 @@
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();

/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
if (lnode->executeInPlace() != InPlace::NONE) {
if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
@@ -873,13 +938,17 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
}
}

lnode->setDataType(init_context.getWeightDataType(),
init_context.getActivationDataType());

lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1027,7 +1096,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,
// TODO: update weights spec for trainable based on layer trainable prop
weights, inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
lnode->getTrainable(), shared_tensor_names),
init_context.getLossScale());

return outputs;
}
@@ -1197,7 +1267,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
(rc.isGradientClipByGlobalNorm(i) &&
((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
@@ -1287,11 +1357,19 @@ int NetworkGraph::initialize(ExecutionMode mode,

/** select weights which would require clipping of the gradients by global
* norm if any */
clip_weights = tensor_manager->getWeights([](const Weight *w) {
lazy_weights = tensor_manager->getWeights([](const Weight *w) {
return w->hasGradient() && w->isGradientLastAccess() &&
w->isGradientClipByGlobalNorm();
(w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
});

is_clip_grad = false;
for (auto w : lazy_weights) {
if (w->isGradientClipByGlobalNorm()) {
is_clip_grad = true;
break;
}
}

return ML_ERROR_NONE;
}

@@ -1556,10 +1634,18 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
Tensor::Initializer::ZEROS));
}
}
}

void NetworkGraph::resetLossScale(float scale) {
for (auto iter = cbegin(); iter != cend(); iter++) {
auto &ln = *iter;
ln->getRunContext().setLossScale(scale);
}
}

} /* namespace nntrainer */
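Taken together, the network_graph changes implement a dynamic loss-scaling loop for mixed precision: backwarding_op now reports whether any gradient overflowed to NaN, in which case the loss scale is backed off (halved, floored at 1.0), forwarding is re-run from the failing node, and backwarding() returns false so the trainer retries the iteration; after 2000 applied iterations the scale is grown again. The constants follow the @todo notes in the diff (backoff factor 0.5, growth factor 2.0, growth interval 2000). Below is a self-contained sketch of that policy with hypothetical names and an assumed starting scale; it is not nntrainer's actual class structure.

```cpp
// Sketch of the dynamic loss-scaling policy the diff implements:
// back off on overflow, grow after a fixed number of clean steps.
#include <iostream>

struct LossScaler {
  float scale = 65536.0f;       // assumed starting scale for the sketch
  unsigned int clean_steps = 0; // plays the role of nan_count in the diff

  // Returns true when the step is usable; false means "retry with new scale".
  bool update(bool grad_has_nan) {
    if (grad_has_nan) {
      scale = scale > 1.5f ? scale * 0.5f : 1.0f; // backoff, floored at 1.0
      clean_steps = 0;
      return false;                               // caller re-runs fwd + bwd
    }
    if (++clean_steps > 2000) {                   // growth_interval
      scale *= 2.0f;                              // growth_factor
      clean_steps = 0;
    }
    return true;
  }
};

int main() {
  LossScaler s;
  bool usable = s.update(/*grad_has_nan=*/true);  // simulated overflow
  std::cout << "retry needed: " << !usable << ", scale now " << s.scale << '\n';
  for (int i = 0; i < 2001; ++i)                  // enough clean steps for one growth
    s.update(false);
  std::cout << "scale after growth interval: " << s.scale << '\n';
  return 0;
}
```

In the PR itself the retry is driven by the boolean return of NetworkGraph::backwarding(); the sketch only isolates the scale-adjustment rule so the backoff/growth behavior is easy to follow.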