diff --git a/runtime/onert/backend/train/BackendContext.cc b/runtime/onert/backend/train/BackendContext.cc index 3e72b72d2ef..8d0d6f22f00 100644 --- a/runtime/onert/backend/train/BackendContext.cc +++ b/runtime/onert/backend/train/BackendContext.cc @@ -16,11 +16,13 @@ #include "BackendContext.h" +#include "ExtraTensorGenerator.h" #include "TensorBuilder.h" #include "KernelGenerator.h" #include "ops/BackPropInitializer.h" #include +#include #include #include @@ -229,6 +231,29 @@ FunctionMap BackendContext::genKernels() // fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); // } + ExtraTensorGenerator extra_tensor_gen(trainable_graph(), _tensor_builder, _tensor_registry); + + const auto &ops = trainable_graph()->operations(); + + for (auto &pair : ret) + { + auto &op_idx = pair.first; + auto &fn_seq = pair.second; + + const ir::IOperation *op = &ops.at(op_idx); + const auto trainable_op = dynamic_cast(op); + assert(trainable_op != nullptr); + + if (not trainable_op->isRequiredForBackward()) + continue; + + fn_seq->iterate([&](exec::train::ITrainableFunction &fn) { + extra_tensor_gen.register_tensors(op_idx, (&fn)->requestExtraTensors()); + }); + } + extra_tensor_gen.plan(); + extra_tensor_gen.allocate(); + return ret; } diff --git a/runtime/onert/backend/train/ExtraTensorGenerator.cc b/runtime/onert/backend/train/ExtraTensorGenerator.cc new file mode 100644 index 00000000000..0cbe8b8b1f9 --- /dev/null +++ b/runtime/onert/backend/train/ExtraTensorGenerator.cc @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "ExtraTensorGenerator.h"
+
+#include "ExtraTensorIndex.h"
+
+#include
+#include
+#include
+
+namespace onert
+{
+namespace backend
+{
+namespace train
+{
+
+ExtraTensorGenerator::ExtraTensorGenerator(const ir::train::TrainableGraph *tgraph,
+                                           std::shared_ptr &tensor_builder,
+                                           std::shared_ptr &tensor_registry)
+  : _tgraph(tgraph), _tensor_builder(tensor_builder)
+{
+  _tensor_reg = std::dynamic_pointer_cast(tensor_registry);
+}
+
+void ExtraTensorGenerator::register_tensors(ir::OperationIndex op_idx, ExtraTensorRequests &&reqs)
+{
+  // Save requests; _idx_to_requests will be used for memory planning
+  if (reqs.size() == 0)
+    return;
+
+  // _idx_to_requests[op_idx] = reqs;
+  _idx_to_requests.insert({op_idx, reqs});
+  auto &operations = _tgraph->operations();
+
+  for (size_t i = 0; i < reqs.size(); i++)
+  {
+    // register tensor
+    ExtraTensorIndex tensor_idx(op_idx, i);
+    _tensor_builder->registerExtraTensorInfo(tensor_idx, reqs[i].info);
+
+    std::stringstream op_info;
+    op_info << op_idx << "_" << operations.at(op_idx).name();
+    VERBOSE(ExtraTensorGenerator) << "register (idx:" << tensor_idx << ") requested from "
+                                  << op_info.str() << std::endl;
+
+    // Return the registered tensor to the requester
+    auto generated_tensor = _tensor_reg->getExtraTensor(tensor_idx);
+    *reqs[i].address = generated_tensor;
+  }
+  return;
+}
+
+void ExtraTensorGenerator::plan()
+{
+  // forwarding order
+  const auto f_order = _tgraph->topolSortOperations();
+  for (const auto &op_index : f_order)
+  {
+    auto &reqs = _idx_to_requests[op_index];
+    for (auto i = 0u; i < reqs.size(); ++i)
+    {
+      auto lt = reqs[i].lifetime;
+      if (lt == ExtraTensorLifeTime::FORWARD_TO_BACKWARD)
+        _tensor_builder->notifyFirstUse(ExtraTensorIndex(op_index, i));
+    }
+  }
+
+  // backwarding order
+  const auto b_order = _tgraph->essentialBackwardOrder();
+  for (const auto &op_index : b_order)
+  {
+    auto &reqs = _idx_to_requests[op_index];
+
+    for (auto i = 0u; i < reqs.size(); ++i)
+    {
+      auto lt = reqs[i].lifetime;
+      if (lt == ExtraTensorLifeTime::BACKWARD)
+        _tensor_builder->notifyFirstUse(ExtraTensorIndex(op_index, i));
+    }
+
+    for (auto i = 0u; i < reqs.size(); ++i)
+    {
+      auto lt = reqs[i].lifetime;
+      if (lt == ExtraTensorLifeTime::FORWARD_TO_BACKWARD || lt == ExtraTensorLifeTime::BACKWARD)
+        _tensor_builder->notifyLastUse(ExtraTensorIndex(op_index, i));
+    }
+  }
+}
+
+void ExtraTensorGenerator::allocate() { _tensor_builder->allocateExtra(); }
+
+} // namespace train
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/train/ExtraTensorGenerator.h b/runtime/onert/backend/train/ExtraTensorGenerator.h
new file mode 100644
index 00000000000..c41a43eff4b
--- /dev/null
+++ b/runtime/onert/backend/train/ExtraTensorGenerator.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_EXTRA_TENSOR_GENERATOR_H__
+#define __ONERT_BACKEND_EXTRA_TENSOR_GENERATOR_H__
+
+#include
+#include
+#include
+
+#include "TensorBuilder.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace train
+{
+
+class ExtraTensorGenerator
+{
+public:
+  ExtraTensorGenerator() = delete;
+
+  ExtraTensorGenerator(const ir::train::TrainableGraph *tgraph,
+                       std::shared_ptr &tensor_builder,
+                       std::shared_ptr &tensor_registry);
+
+public:
+  // Since 'register' is a reserved keyword, use 'register_tensors' instead of 'register'
+  void register_tensors(ir::OperationIndex idx, ExtraTensorRequests &&requests);
+  void plan();
+  void allocate();
+
+private:
+  const ir::train::TrainableGraph *_tgraph;
+  std::shared_ptr _tensor_builder;
+  std::shared_ptr _tensor_reg;
+  std::unordered_map _idx_to_requests;
+};
+
+} // namespace train
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_EXTRA_TENSOR_GENERATOR_H__
diff --git a/runtime/onert/backend/train/ExtraTensorIndex.h b/runtime/onert/backend/train/ExtraTensorIndex.h
new file mode 100644
index 00000000000..80cc904a177
--- /dev/null
+++ b/runtime/onert/backend/train/ExtraTensorIndex.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __ONERT_BACKEND_TRAIN_EXTRA_TENSOR_INDEX_H__ +#define __ONERT_BACKEND_TRAIN_EXTRA_TENSOR_INDEX_H__ + +#include + +namespace onert +{ +namespace backend +{ +namespace train +{ + +class ExtraTensorIndex +{ +public: + ExtraTensorIndex(ir::OperationIndex op, uint32_t sub) : op_index(op), sub_index(sub) {} + + ir::OperationIndex op_index; + uint32_t sub_index; + + bool operator==(const ExtraTensorIndex &other) const + { + return op_index == other.op_index && sub_index == other.sub_index; + } +}; + +inline std::ostream &operator<<(std::ostream &o, const ExtraTensorIndex &i) +{ + o << i.op_index; + o << "-" << i.sub_index; + return o; +} + +} // namespace train +} // namespace backend +} // namespace onert + +namespace std +{ + +template <> struct hash +{ + size_t operator()(const onert::backend::train::ExtraTensorIndex &index) const noexcept + { + const auto op_index = index.op_index; + const auto sub_index = index.sub_index; + + return (static_cast(op_index.value())) << 16 | static_cast(sub_index); + } +}; + +} // namespace std + +#endif // __ONERT_BACKEND_TRAIN_EXTRA_TENSOR_INDEX_H__ diff --git a/runtime/onert/backend/train/MemoryManager.cc b/runtime/onert/backend/train/MemoryManager.cc index 87cd15d55a8..36ff49b2d85 100644 --- a/runtime/onert/backend/train/MemoryManager.cc +++ b/runtime/onert/backend/train/MemoryManager.cc @@ -17,6 +17,7 @@ #include "MemoryManager.h" #include "MemoryPlannerFactory.h" +#include "ExtraTensorIndex.h" #include @@ -53,52 +54,60 @@ uint8_t *GradientMemoryManager::getOptVarBuffer(const ir::OperandIndex &ind, uin return _var_mem_alloc->base() + var_offset + mem_blk.offset; } -DisposableMemoryManager::DisposableMemoryManager() : _mem_planner{createMemoryPlanner()} +template +TrainMemoryManager::TrainMemoryManager() : _mem_planner{createMemoryPlanner()} { // DO NOTHING } -DisposableMemoryManager::DisposableMemoryManager(const std::string planner_id) +template +TrainMemoryManager::TrainMemoryManager(const std::string planner_id) : _mem_planner{createMemoryPlanner(planner_id)} { // DO NOTHING } -basic::IMemoryPlanner *DisposableMemoryManager::createMemoryPlanner() +template +basic::IMemoryPlanner *TrainMemoryManager::createMemoryPlanner() { auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER); - return MemoryPlannerFactory::get().create(planner_id); + return MemoryPlannerFactory::get().create(planner_id); } -basic::IMemoryPlanner * -DisposableMemoryManager::createMemoryPlanner(const std::string planner_id) +template +basic::IMemoryPlanner * +TrainMemoryManager::createMemoryPlanner(const std::string planner_id) { - return MemoryPlannerFactory::get().create(planner_id); + return MemoryPlannerFactory::get().create(planner_id); } -void DisposableMemoryManager::claimPlan(const DisposableTensorIndex &ind, uint32_t size) +template void TrainMemoryManager::claimPlan(const Index &ind, uint32_t size) { _mem_planner->claim(ind, size); } -void DisposableMemoryManager::releasePlan(const DisposableTensorIndex &ind) +template void TrainMemoryManager::releasePlan(const Index &ind) { _mem_planner->release(ind); } -void DisposableMemoryManager::allocate(void) +template void TrainMemoryManager::allocate(void) { _mem_alloc = std::make_shared(_mem_planner->capacity()); assert(_mem_alloc->base()); } -uint8_t *DisposableMemoryManager::getBuffer(const DisposableTensorIndex &ind) const +template uint8_t *TrainMemoryManager::getBuffer(const Index &ind) const { assert(_mem_planner->memory_plans().find(ind) != _mem_planner->memory_plans().end()); const auto 
&mem_blk = _mem_planner->memory_plans().at(ind); return _mem_alloc->base() + mem_blk.offset; } +// Instatiation +template class TrainMemoryManager; +template class TrainMemoryManager; + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryManager.h b/runtime/onert/backend/train/MemoryManager.h index 987cf905100..f330d8073c8 100644 --- a/runtime/onert/backend/train/MemoryManager.h +++ b/runtime/onert/backend/train/MemoryManager.h @@ -20,6 +20,7 @@ #include #include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" namespace onert { @@ -44,30 +45,34 @@ class GradientMemoryManager : public MemoryManager uint32_t _optim_vars_count; }; -class DisposableMemoryManager +// TODO: Find a better name +template class TrainMemoryManager { public: - DisposableMemoryManager(); - DisposableMemoryManager(const std::string planner_id); + TrainMemoryManager(); + TrainMemoryManager(const std::string planner_id); void allocate(void); - uint8_t *getBuffer(const DisposableTensorIndex &ind) const; + uint8_t *getBuffer(const Index &ind) const; void deallocate(void) { _mem_alloc->release(); } - void claimPlan(const DisposableTensorIndex &ind, uint32_t size); - void releasePlan(const DisposableTensorIndex &ind); + void claimPlan(const Index &ind, uint32_t size); + void releasePlan(const Index &ind); std::shared_ptr getMemAlloc() { return _mem_alloc; } private: - basic::IMemoryPlanner *createMemoryPlanner(); - basic::IMemoryPlanner *createMemoryPlanner(const std::string planner_id); + basic::IMemoryPlanner *createMemoryPlanner(); + basic::IMemoryPlanner *createMemoryPlanner(const std::string planner_id); private: - std::shared_ptr> _mem_planner; + std::shared_ptr> _mem_planner; std::shared_ptr _mem_alloc; }; +using DisposableMemoryManager = TrainMemoryManager; +using ExtraMemoryManager = TrainMemoryManager; + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryPlanner.cc b/runtime/onert/backend/train/MemoryPlanner.cc index ea385558e28..e60e9f99948 100644 --- a/runtime/onert/backend/train/MemoryPlanner.cc +++ b/runtime/onert/backend/train/MemoryPlanner.cc @@ -15,6 +15,8 @@ */ #include "MemoryPlanner.h" +#include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" #include @@ -27,7 +29,7 @@ namespace backend namespace train { -void BumpPlanner::claim(const DisposableTensorIndex &ind, size_t size) +template void BumpPlanner::claim(const Index &ind, size_t size) { basic::Block blk{_capacity, size}; _mem_plans[ind] = blk; @@ -36,7 +38,7 @@ void BumpPlanner::claim(const DisposableTensorIndex &ind, size_t size) VERBOSE(BP_PLANNER) << "CLAIM(" << ind << "): " << blk.offset << ", " << blk.size << std::endl; } -void BumpPlanner::release(const DisposableTensorIndex &ind) +template void BumpPlanner::release(const Index &ind) { VERBOSE(BP_PLANNER) << "RELEASE(" << ind << "): " << "NOTHING does" << std::endl; @@ -56,7 +58,7 @@ void BumpPlanner::release(const DisposableTensorIndex &ind) // point in time, it means the place at the offset can be claimed. // 2. In the loop for _claim_table, we can assume the current claim_base_offset value is bigger than // the previous claim_base_offset. 
-void FirstFitPlanner::claim(const DisposableTensorIndex &ind, size_t size) +template void FirstFitPlanner::claim(const Index &ind, size_t size) { // Find the right position for claiming uint32_t next_offset = 0; @@ -88,7 +90,7 @@ void FirstFitPlanner::claim(const DisposableTensorIndex &ind, size_t size) } } -void FirstFitPlanner::release(const DisposableTensorIndex &ind) +template void FirstFitPlanner::release(const Index &ind) { for (auto it = _claim_table.cbegin(); it != _claim_table.cend(); ++it) { @@ -107,14 +109,15 @@ void FirstFitPlanner::release(const DisposableTensorIndex &ind) assert(!"Cannot release for given index. It has been not claimed or released already."); } -WICPlanner::WICPlanner() +template +WICPlanner::WICPlanner() : _initialized(false), _capacity(0), _mem_plans(), _live_indices(), _interference_graph(), _indices() { // DO NOTHING } -void WICPlanner::claim(const DisposableTensorIndex &ind, size_t size) +template void WICPlanner::claim(const Index &ind, size_t size) { _indices.emplace(size, ind); _interference_graph[ind].insert(_interference_graph[ind].end(), _live_indices.cbegin(), @@ -128,7 +131,7 @@ void WICPlanner::claim(const DisposableTensorIndex &ind, size_t size) VERBOSE(WIC_PLANNER) << "claim(" << ind << "): [" << size << "sz]" << std::endl; } -void WICPlanner::release(const DisposableTensorIndex &ind) +template void WICPlanner::release(const Index &ind) { _live_indices.erase(ind); VERBOSE(WIC_PLANNER) << "release(" << ind << ")" << std::endl; @@ -143,7 +146,7 @@ void WICPlanner::release(const DisposableTensorIndex &ind) * 3. Allocate memory block for sorted operands * - Find free memory block which does not overlap with interfered operands */ -void WICPlanner::buildMemoryPlans() +template void WICPlanner::buildMemoryPlans() { for (const auto &[size, ind] : _indices) { @@ -194,13 +197,22 @@ void WICPlanner::buildMemoryPlans() _indices.clear(); } -std::unordered_map &WICPlanner::memory_plans() +template typename WICPlanner::MemoryPlans &WICPlanner::memory_plans() { if (!_initialized) buildMemoryPlans(); return _mem_plans; } +template class BumpPlanner; +template class BumpPlanner; + +template class FirstFitPlanner; +template class FirstFitPlanner; + +template class WICPlanner; +template class WICPlanner; + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryPlanner.h b/runtime/onert/backend/train/MemoryPlanner.h index 181dd5e6979..5e3f48e02f8 100644 --- a/runtime/onert/backend/train/MemoryPlanner.h +++ b/runtime/onert/backend/train/MemoryPlanner.h @@ -24,13 +24,14 @@ #include -#include "DisposableTensorIndex.h" - #include #include #include #include +#include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" + namespace onert { namespace backend @@ -41,20 +42,22 @@ namespace train /** * @brief Class to plan memory by bump way */ -class BumpPlanner : public basic::IMemoryPlanner +template class BumpPlanner : public basic::IMemoryPlanner { + using MemoryPlans = typename basic::IMemoryPlanner::MemoryPlans; + public: /** * @brief Claim memory for tensor by bump way * @param[in] index The tensor index * @param[in] size The size of the memory */ - void claim(const DisposableTensorIndex &, size_t) override; + void claim(const Index &, size_t) override; /** * @brief Release memory for tensor by bump way * @param[in] index The tensor index */ - void release(const DisposableTensorIndex &) override; + void release(const Index &) override; /** * @brief Get capacity for memory planning * @return The value of 
capacity @@ -74,20 +77,22 @@ class BumpPlanner : public basic::IMemoryPlanner /** * @brief Class to plan memory by firstfit way */ -class FirstFitPlanner : public basic::IMemoryPlanner +template class FirstFitPlanner : public basic::IMemoryPlanner { + using MemoryPlans = typename basic::IMemoryPlanner::MemoryPlans; + public: /** * @brief Claim memory for tensor by firstfit way * @param[in] index The tensor index * @param[in] size The size of the memory */ - void claim(const DisposableTensorIndex &, size_t) override; + void claim(const Index &, size_t) override; /** * @brief Release memory for tensor by firstfit way * @param[in] index The tensor index */ - void release(const DisposableTensorIndex &) override; + void release(const Index &) override; /** * @brief Get capacity for memory planning * @return The value of capacity @@ -103,14 +108,17 @@ class FirstFitPlanner : public basic::IMemoryPlanner uint32_t _capacity = 0; MemoryPlans _mem_plans; // Use std::map because claim() assumes that _claim_table is sorted by uint32_t(base_offset) - std::map _claim_table; + std::map _claim_table; }; /** * @brief Class to plan memory by Weighted Interval Color algorithm */ -class WICPlanner : public basic::IMemoryPlanner +template class WICPlanner : public basic::IMemoryPlanner { +public: + using MemoryPlans = typename basic::IMemoryPlanner::MemoryPlans; + public: WICPlanner(); @@ -119,12 +127,12 @@ class WICPlanner : public basic::IMemoryPlanner * @param[in] index The tensor index * @param[in] size The size of the memory */ - void claim(const DisposableTensorIndex &, size_t) override; + void claim(const Index &, size_t) override; /** * @brief Release memory for tensor by WIC algorithm * @param[in] index The tensor index */ - void release(const DisposableTensorIndex &) override; + void release(const Index &) override; /** * @brief Get capacity for memory planning * @return The value of capacity @@ -147,10 +155,10 @@ class WICPlanner : public basic::IMemoryPlanner bool _initialized; uint32_t _capacity; MemoryPlans _mem_plans; - std::unordered_set _live_indices; - DisposableTensorIndexMap> _interference_graph; + std::unordered_set _live_indices; + std::unordered_map> _interference_graph; // Sort tensors by descending order of size - std::multimap> _indices; + std::multimap> _indices; }; } // namespace train diff --git a/runtime/onert/backend/train/MemoryPlanner.test.cc b/runtime/onert/backend/train/MemoryPlanner.test.cc index 8978607706f..15b9dc15693 100644 --- a/runtime/onert/backend/train/MemoryPlanner.test.cc +++ b/runtime/onert/backend/train/MemoryPlanner.test.cc @@ -16,6 +16,7 @@ #include +#include "DisposableTensorIndex.h" #include "MemoryPlanner.h" #include "ir/Index.h" @@ -25,7 +26,7 @@ using onert::ir::OperationIndex; TEST(BumpPlanner, claim_test) { - BumpPlanner planner; + BumpPlanner planner; auto claim = [&planner](uint32_t op_index, uint32_t operand_index, size_t size, uint32_t expected_offset) { @@ -55,7 +56,7 @@ TEST(BumpPlanner, claim_test) TEST(FirstFitPlanner, claim_release_test) { - FirstFitPlanner planner; + FirstFitPlanner planner; auto claim = [&planner](uint32_t op_index, uint32_t operand_index, size_t size, uint32_t expected_offset) { @@ -148,7 +149,7 @@ TEST(FirstFitPlanner, claim_release_test) TEST(FirstFitPlanner, neg_release_non_existing_index) { - FirstFitPlanner planner; + FirstFitPlanner planner; auto claim = [&planner](uint32_t op_index, uint32_t operand_index, size_t size, uint32_t expected_offset) { @@ -184,7 +185,7 @@ TEST(FirstFitPlanner, 
neg_release_non_existing_index) TEST(FirstFitPlanner, neg_release_twice) { - FirstFitPlanner planner; + FirstFitPlanner planner; auto claim = [&planner](uint32_t op_index, uint32_t operand_index, size_t size, uint32_t expected_offset) { @@ -223,7 +224,7 @@ TEST(FirstFitPlanner, neg_release_twice) TEST(WICPlanner, claim_release_test) { - WICPlanner planner; + WICPlanner planner; auto claim = [&planner](uint32_t op_index, uint32_t operand_index, size_t size) { DisposableTensorIndex mem_idx{OperationIndex{op_index}, OperandIndex{operand_index}}; diff --git a/runtime/onert/backend/train/MemoryPlannerFactory.cc b/runtime/onert/backend/train/MemoryPlannerFactory.cc index acfa44e3511..e1e80119213 100644 --- a/runtime/onert/backend/train/MemoryPlannerFactory.cc +++ b/runtime/onert/backend/train/MemoryPlannerFactory.cc @@ -16,6 +16,8 @@ #include "MemoryPlannerFactory.h" +#include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" namespace onert { namespace backend @@ -23,29 +25,47 @@ namespace backend namespace train { -MemoryPlannerFactory &MemoryPlannerFactory::get() +template MemoryPlannerFactory &MemoryPlannerFactory::get() { - static MemoryPlannerFactory instance; + static MemoryPlannerFactory instance; return instance; } -basic::IMemoryPlanner *MemoryPlannerFactory::create(const std::string &key) +template +basic::IMemoryPlanner *MemoryPlannerFactory::create(const std::string &key) { if (key == "FirstFit") { - return new FirstFitPlanner; + return new FirstFitPlanner(); } else if (key == "Bump") { - return new BumpPlanner; + return new BumpPlanner(); } else if (key == "WIC") { - return new WICPlanner; + return new WICPlanner(); } - return new FirstFitPlanner; // Default Planner + return new FirstFitPlanner(); // Default Planner } +// is this necessary? 
+/** +/usr/bin/ld: libbackend_train.so: undefined reference to +`onert::backend::train::MemoryPlannerFactory::create(std::__cxx11::basic_string, std::allocator > const&)' /usr/bin/ld: libbackend_train.so: undefined +reference to +`onert::backend::train::MemoryPlannerFactory::create(std::__cxx11::basic_string, std::allocator > const&)' /usr/bin/ld: libbackend_train.so: undefined +reference to +`onert::backend::train::MemoryPlannerFactory::get()' +/usr/bin/ld: libbackend_train.so: undefined reference to +`onert::backend::train::MemoryPlannerFactory::get()' +collect2: error: ld returned 1 exit status + */ +template class MemoryPlannerFactory; +template class MemoryPlannerFactory; + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryPlannerFactory.h b/runtime/onert/backend/train/MemoryPlannerFactory.h index d1609e17559..7f42be46f90 100644 --- a/runtime/onert/backend/train/MemoryPlannerFactory.h +++ b/runtime/onert/backend/train/MemoryPlannerFactory.h @@ -28,7 +28,7 @@ namespace backend namespace train { -class MemoryPlannerFactory +template class MemoryPlannerFactory { public: static MemoryPlannerFactory &get(); @@ -38,7 +38,7 @@ class MemoryPlannerFactory public: // Currently, only the memory planner for DisposableTensor is supported - basic::IMemoryPlanner *create(const std::string &key); + basic::IMemoryPlanner *create(const std::string &key); }; } // namespace train diff --git a/runtime/onert/backend/train/TensorBuilder.cc b/runtime/onert/backend/train/TensorBuilder.cc index 80452858057..115cb1e535d 100644 --- a/runtime/onert/backend/train/TensorBuilder.cc +++ b/runtime/onert/backend/train/TensorBuilder.cc @@ -18,6 +18,8 @@ #include "Tensor.h" +#include + namespace onert { namespace backend @@ -97,6 +99,15 @@ void TensorBuilder::registerDisposableBackwardTensorInfo(const DisposableTensorI _disposable_backprops.add(index); } +void TensorBuilder::registerExtraTensorInfo(const ExtraTensorIndex &index, + const ir::OperandInfo &info) +{ + assert(!info.isDynamic()); + + auto extra_tensor = std::make_unique(info); + _tensor_reg->setExtraTensor(index, std::move(extra_tensor)); +} + void TensorBuilder::notifyFirstUse(const ir::OperandIndex &index) { // TODO Support momory plan @@ -157,6 +168,16 @@ void TensorBuilder::notifyDisposableBackPropLastUse(const DisposableTensorIndex _tensor_mgr->releaseDisposableBackPropPlan(index); } +void TensorBuilder::notifyFirstUse(const ExtraTensorIndex &index) +{ + _tensor_mgr->claimExtraPlan(index); +} + +void TensorBuilder::notifyLastUse(const ExtraTensorIndex &index) +{ + _tensor_mgr->releaseExtraPlan(index); +} + bool TensorBuilder::isRegistered(const ir::OperandIndex &index) const { return _tensor_info_map.find(index) != _tensor_info_map.end(); @@ -185,6 +206,8 @@ void TensorBuilder::allocateBackward(void) _tensor_mgr->allocateDisposableBackPropTensors(); } +void TensorBuilder::allocateExtra(void) { _tensor_mgr->allocateExtraTensors(); } + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/TensorBuilder.h b/runtime/onert/backend/train/TensorBuilder.h index f6ffbbb0e20..e23e185a047 100644 --- a/runtime/onert/backend/train/TensorBuilder.h +++ b/runtime/onert/backend/train/TensorBuilder.h @@ -18,6 +18,7 @@ #define __ONERT_BACKEND_TRAIN_TENSOR_BUILDER_H__ #include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" #include "TensorManager.h" #include "TensorRegistry.h" #include "util/Set.h" @@ -55,6 +56,8 @@ class TensorBuilder void 
registerDisposableBackwardTensorInfo(const DisposableTensorIndex &index, const ir::OperandInfo &info); + void registerExtraTensorInfo(const ExtraTensorIndex &index, const ir::OperandInfo &info); + // TODO Support memory plan of all tensors void notifyFirstUse(const ir::OperandIndex &); void notifyLastUse(const ir::OperandIndex &); @@ -62,6 +65,8 @@ class TensorBuilder void notifyBackwardLastUse(const ir::OperandIndex &); void notifyDisposableBackPropFirstUse(const DisposableTensorIndex &); void notifyDisposableBackPropLastUse(const DisposableTensorIndex &); + void notifyFirstUse(const ExtraTensorIndex &); + void notifyLastUse(const ExtraTensorIndex &); bool isRegistered(const ir::OperandIndex &) const; bool isRegisteredBackward(const ir::OperandIndex &) const; @@ -69,6 +74,7 @@ class TensorBuilder void allocate(void); void allocateBackward(void); + void allocateExtra(void); // < this function will be called after genKernels private: const std::shared_ptr _tensor_reg; diff --git a/runtime/onert/backend/train/TensorManager.cc b/runtime/onert/backend/train/TensorManager.cc index cf5373fae74..6ee643bf081 100644 --- a/runtime/onert/backend/train/TensorManager.cc +++ b/runtime/onert/backend/train/TensorManager.cc @@ -59,7 +59,8 @@ TensorManager::TensorManager(const std::shared_ptr ®, _back_prop_mgr{new MemoryManager(planner_id)}, _gradient_mgr{new GradientMemoryManager(planner_id, optim_vars_count)}, // TODO Find a suitable planner of disposable tensors to reduce peak memory usage - _disposable_back_prop_mgr{new DisposableMemoryManager(std::string("WIC"))}, _tensors{reg} + _disposable_back_prop_mgr{new DisposableMemoryManager(std::string("WIC"))}, + _extra_mgr{new ExtraMemoryManager(std::string("WIC"))}, _tensors{reg} { // DO NOTHING } @@ -118,6 +119,11 @@ void TensorManager::claimNonConstPlan(const ir::OperandIndex &index) _nonconst_mgr->claimPlan(index, size); } +void TensorManager::allocateExtraTensors() +{ + allocateMemory(_extra_mgr.get(), _tensors->extra_tensors(), std::string{"EXTRA TENSOR "}); +} + void TensorManager::releaseNonConstPlan(const ir::OperandIndex &index) { assert(_tensors->getNonConstTensor(index) && !_tensors->getNonConstTensor(index)->is_dynamic()); @@ -190,6 +196,19 @@ void TensorManager::releaseDisposableBackPropPlan(const DisposableTensorIndex &i _disposable_back_prop_mgr->releasePlan(index); } +void TensorManager::claimExtraPlan(const ExtraTensorIndex &index) +{ + const auto tensor = _tensors->getExtraTensor(index); + + auto size = alignedSize(tensor->total_size(), _align); + _extra_mgr->claimPlan(index, size); +} + +void TensorManager::releaseExtraPlan(const ExtraTensorIndex &index) +{ + _extra_mgr->releasePlan(index); +} + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/TensorManager.h b/runtime/onert/backend/train/TensorManager.h index f8d29b16e1d..99bedb0781a 100644 --- a/runtime/onert/backend/train/TensorManager.h +++ b/runtime/onert/backend/train/TensorManager.h @@ -18,6 +18,7 @@ #define __ONERT_BACKEND_TRAIN_TENSOR_MANAGER_H__ #include "DisposableTensorIndex.h" +#include "ExtraTensorIndex.h" #include "MemoryManager.h" #include "TensorRegistry.h" @@ -50,6 +51,8 @@ class TensorManager void allocateBackPropTensors(); void allocateGradientTensors(); void allocateDisposableBackPropTensors(); + void allocateExtraTensors(); + // TODO Add member functions to deallocate tensors void claimNonConstPlan(const ir::OperandIndex &ind); @@ -62,6 +65,8 @@ class TensorManager void releaseGradientPlan(const 
ir::OperandIndex &ind); void claimDisposableBackPropPlan(const DisposableTensorIndex &ind); void releaseDisposableBackPropPlan(const DisposableTensorIndex &ind); + void claimExtraPlan(const ExtraTensorIndex &ind); + void releaseExtraPlan(const ExtraTensorIndex &ind); private: std::unique_ptr _nonconst_mgr; @@ -69,6 +74,7 @@ class TensorManager std::unique_ptr _back_prop_mgr; std::unique_ptr _gradient_mgr; std::unique_ptr _disposable_back_prop_mgr; + std::unique_ptr _extra_mgr; const std::shared_ptr _tensors; }; diff --git a/runtime/onert/backend/train/TensorRegistry.h b/runtime/onert/backend/train/TensorRegistry.h index 13932199a9d..7dd56d53877 100644 --- a/runtime/onert/backend/train/TensorRegistry.h +++ b/runtime/onert/backend/train/TensorRegistry.h @@ -18,6 +18,7 @@ #define __ONERT_BACKEND_TRAIN_TENSOR_REGISTRY__ #include +#include #include "DisposableTensorIndex.h" #include "Tensor.h" @@ -60,9 +61,35 @@ class TensorRegistry return _disposable_back_prop; } + ExtraTensor *getExtraTensor(const ExtraTensorIndex &index) + { + auto itr = _extra.find(index); + if (itr != _extra.end()) + return itr->second.get(); + + return nullptr; + } + + void setExtraTensor(const ExtraTensorIndex &index, std::unique_ptr tensor) + { + assert(tensor != nullptr); + auto itr = _extra.find(index); + if (itr != _extra.end()) + throw std::runtime_error{ + "Tried to set a extra tensor but another extra tensor already exists."}; + + _extra[index] = std::move(tensor); + } + + const std::unordered_map> &extra_tensors() + { + return _extra; + } + private: // Disposable Tensors to be accumulated to BackPropTensor std::unordered_map> _disposable_back_prop; + std::unordered_map> _extra; }; } // namespace train diff --git a/runtime/onert/backend/train/ops/BinaryArithmeticLayer.cc b/runtime/onert/backend/train/ops/BinaryArithmeticLayer.cc index 3c4ce2f7ce1..f9518dbaca6 100644 --- a/runtime/onert/backend/train/ops/BinaryArithmeticLayer.cc +++ b/runtime/onert/backend/train/ops/BinaryArithmeticLayer.cc @@ -55,11 +55,19 @@ void BinaryArithmeticLayer::configureBackward(IPortableTensor *back_prop_lhs, if (activation != ir::Activation::NONE) { - _act_back_prop_output = std::make_unique(_output->get_info()); - _act_back_prop_output->setBuffer(std::make_shared(_output->total_size())); } } +ExtraTensorRequests BinaryArithmeticLayer::requestExtraTensors() +{ + ExtraTensorRequests req; + + if (_activation != ir::Activation::NONE) + req.push_back(ExtraTensorRequest::createLike(_back_prop_output, &_act_back_prop_output)); + + return req; +} + void BinaryArithmeticLayer::forward(bool) { cpu::ops::BinaryArithmeticLayer::run(); } void BinaryArithmeticLayer::backward() @@ -72,7 +80,7 @@ void BinaryArithmeticLayer::backward() try { backprop_act = - backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output.get()); + backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output); } catch (const std::exception &e) { diff --git a/runtime/onert/backend/train/ops/BinaryArithmeticLayer.h b/runtime/onert/backend/train/ops/BinaryArithmeticLayer.h index 60d6e8be1cc..42e1b4327a5 100644 --- a/runtime/onert/backend/train/ops/BinaryArithmeticLayer.h +++ b/runtime/onert/backend/train/ops/BinaryArithmeticLayer.h @@ -50,6 +50,7 @@ class BinaryArithmeticLayer : public ::onert::exec::train::ITrainableFunction, void configureBackward(IPortableTensor *back_prop_lhs, IPortableTensor *back_prop_rhs, const IPortableTensor *back_prop_output, const ir::Activation activation, const ArithmeticType arithmetic_type); + 
ExtraTensorRequests requestExtraTensors() override; void forward(bool training) override; void backward() override; @@ -60,7 +61,7 @@ class BinaryArithmeticLayer : public ::onert::exec::train::ITrainableFunction, ArithmeticType _arithmetic_type; ir::Activation _activation; - std::unique_ptr _act_back_prop_output; + ExtraTensor *_act_back_prop_output; }; } // namespace ops diff --git a/runtime/onert/backend/train/ops/ConvolutionLayer.cc b/runtime/onert/backend/train/ops/ConvolutionLayer.cc index 41ff7fd1c43..698164f4b62 100644 --- a/runtime/onert/backend/train/ops/ConvolutionLayer.cc +++ b/runtime/onert/backend/train/ops/ConvolutionLayer.cc @@ -30,6 +30,7 @@ namespace using namespace onert; +/* template std::unique_ptr createTransposedWeights(const backend::IPortableTensor *origin_weights) { @@ -44,6 +45,28 @@ std::unique_ptr createTransposedWeights(const backend::IPortableTensor * return std::make_unique(transposed_info); } +*/ + +ir::OperandInfo transposeOperandInfo(const ir::OperandInfo &origin_info) +{ + const auto &origin_shape = origin_info.shape(); + assert(origin_shape.rank() == 4); + + auto transposed_info = ir::OperandInfo(origin_info); + auto transposed_shape = + ir::Shape{origin_shape.dim(1), origin_shape.dim(2), origin_shape.dim(3), origin_shape.dim(0)}; + transposed_info.shape(transposed_shape); + + return transposed_info; +} + +backend::train::ExtraTensorRequest +createTransposeTenosrRequest(const backend::IPortableTensor *origin, + backend::train::ExtraTensor **const addr) +{ + return backend::train::ExtraTensorRequest(transposeOperandInfo(origin->get_info()), + backend::train::ExtraTensorLifeTime::BACKWARD, addr); +} } // namespace @@ -79,27 +102,36 @@ void ConvolutionLayer::configureBackward(const IPortableTensor *weights, if (_dilationHeightFactor != 1 || _dilationWidthFactor != 1) throw std::runtime_error("train ConvolutionLayer: Unsupported dilation yet"); - // TODO Optimize transposed tensors - _transposed_weights = createTransposedWeights(weights); - _transposed_weights->setBuffer( - std::make_shared(_transposed_weights->total_size())); - - _conv_back_prop_output = std::make_unique(back_prop_output->get_info()); - _conv_back_prop_output->setBuffer( - std::make_shared(_conv_back_prop_output->total_size())); - - _transposed_grad_weights = createTransposedWeights(weights); - _transposed_grad_weights->setBuffer( - std::make_shared(_transposed_grad_weights->total_size())); + // TO avoid unused parameter error + if (weights == nullptr) + { + }; if (activation != ir::Activation::NONE) { - _act_back_prop_output = std::make_unique(_back_prop_output->get_info()); - _act_back_prop_output->setBuffer( - std::make_shared(_act_back_prop_output->total_size())); } } +ExtraTensorRequests ConvolutionLayer::requestExtraTensors() +{ + ExtraTensorRequests reqs; + + auto tr_weights = createTransposeTenosrRequest(_kernel, &_transposed_weights); + reqs.push_back(tr_weights); + + auto conv_back_prop_output = + ExtraTensorRequest::createLike(_back_prop_output, &_conv_back_prop_output); + reqs.push_back(conv_back_prop_output); + + auto tr_grad_weights = createTransposeTenosrRequest(_grad_weights, &_transposed_grad_weights); + reqs.push_back(tr_grad_weights); + + if (_activation != ir::Activation::NONE) + reqs.push_back(ExtraTensorRequest::createLike(_back_prop_output, &_act_back_prop_output)); + + return reqs; +} + void ConvolutionLayer::forward(bool) { cpu::ops::ConvolutionLayer::run(); } void ConvolutionLayer::backward() { @@ -125,7 +157,7 @@ void ConvolutionLayer::backwardFloat32() try { 
backprop_act = - backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output.get()); + backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output); } catch (const std::exception &e) { @@ -144,7 +176,7 @@ void ConvolutionLayer::backwardFloat32() conv_train_params.dilation_height_factor = _dilationHeightFactor; // Transpose weights from OHWI to HWIO - auto transposed_weights = _transposed_weights.get(); + auto transposed_weights = _transposed_weights; assert(transposed_weights->getShape().rank() == 4); nnfw::cker::TransposeParams transpose_param; transpose_param.perm_count = transposed_weights->getShape().rank(); @@ -162,7 +194,7 @@ void ConvolutionLayer::backwardFloat32() _paddingRight, getShape(_back_prop_input), getBuffer(_back_prop_input)); // Calculate gradient for weights - auto transposed_grad_weights = _transposed_grad_weights.get(); + auto transposed_grad_weights = _transposed_grad_weights; assert(_grad_weights->getShape().rank() == 4); assert(transposed_grad_weights->getShape().rank() == 4); nnfw::cker::train::ConvFilterGrad( diff --git a/runtime/onert/backend/train/ops/ConvolutionLayer.h b/runtime/onert/backend/train/ops/ConvolutionLayer.h index ef11f68bf57..231e755dd86 100644 --- a/runtime/onert/backend/train/ops/ConvolutionLayer.h +++ b/runtime/onert/backend/train/ops/ConvolutionLayer.h @@ -41,6 +41,7 @@ class ConvolutionLayer : public ::onert::exec::train::ITrainableFunction, void configureBackward(const IPortableTensor *weights, IPortableTensor *back_prop_input, IPortableTensor *grad_weights, IPortableTensor *grad_bias, const IPortableTensor *back_prop_output, const ir::Activation activation); + ExtraTensorRequests requestExtraTensors() override; void forward(bool training) override; void backward() override; @@ -54,10 +55,10 @@ class ConvolutionLayer : public ::onert::exec::train::ITrainableFunction, const IPortableTensor *_back_prop_output; // TODO Consider if these tensors should be built in TensorBuilder - std::unique_ptr _transposed_weights; - std::unique_ptr _conv_back_prop_output; - std::unique_ptr _act_back_prop_output; - std::unique_ptr _transposed_grad_weights; + ExtraTensor *_transposed_weights; + ExtraTensor *_conv_back_prop_output; + ExtraTensor *_transposed_grad_weights; + ExtraTensor *_act_back_prop_output; }; } // namespace ops diff --git a/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.cc b/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.cc index 9443d0fe0ea..901e9b792f6 100644 --- a/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.cc +++ b/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.cc @@ -56,9 +56,11 @@ void DepthwiseConvolutionLayer::configureBackward(IPortableTensor *back_prop_inp if (activation != ir::Activation::NONE) { + /* _act_back_prop_output = std::make_unique(_back_prop_output->get_info()); _act_back_prop_output->setBuffer( std::make_shared(_act_back_prop_output->total_size())); + */ } const int64_t k_packet_size = [&]() { @@ -75,20 +77,20 @@ void DepthwiseConvolutionLayer::configureBackward(IPortableTensor *back_prop_inp }(); const auto incoming_shape = getShape(_back_prop_output); - const auto filter_shape = getShape(_kernel); - const int batch = incoming_shape.Dims(0); + // const auto filter_shape = getShape(_kernel); + // const int batch = incoming_shape.Dims(0); const int out_depth = incoming_shape.Dims(3); - const int filter_rows = filter_shape.Dims(1); - const int filter_cols = filter_shape.Dims(2); + // const int filter_rows = filter_shape.Dims(1); + // 
const int filter_cols = filter_shape.Dims(2); - const int filter_spatial_size = filter_rows * filter_cols; - const int padded_filter_inner_dim_size = - ((out_depth + k_packet_size - 1) / k_packet_size) * k_packet_size; + // const int filter_spatial_size = filter_rows * filter_cols; + // const int padded_filter_inner_dim_size = + // ((out_depth + k_packet_size - 1) / k_packet_size) * k_packet_size; _use_padded_filter = (out_depth % k_packet_size) == 0 ? false : true; // prepare padded_filter buffer for cker - auto padded_filter_info = ir::OperandInfo(_kernel->get_info()); + /* auto padded_filter_info = ir::OperandInfo(_kernel->get_info()); padded_filter_info.shape({batch, filter_spatial_size, padded_filter_inner_dim_size}); _padded_filter = std::make_unique(padded_filter_info); _padded_filter->setBuffer(std::make_shared(_padded_filter->total_size())); @@ -108,6 +110,55 @@ void DepthwiseConvolutionLayer::configureBackward(IPortableTensor *back_prop_inp _filter_dim_buffers = std::make_unique(filter_dim_buffers_info); _filter_dim_buffers->setBuffer( std::make_shared(_filter_dim_buffers->total_size())); + */ +} + +ExtraTensorRequests DepthwiseConvolutionLayer::requestExtraTensors() +{ + ExtraTensorRequests reqs; + + reqs.push_back(ExtraTensorRequest::createLike(_back_prop_output, &_act_back_prop_output)); + + const auto incoming_shape = getShape(_back_prop_output); + const auto batch = incoming_shape.Dims(0); + const auto depth = incoming_shape.Dims(3); + + const auto filter_shape = getShape(_kernel); + const int filter_rows = filter_shape.Dims(1); + const int filter_cols = filter_shape.Dims(2); + const int filter_spatial_size = filter_rows * filter_cols; + + const auto k_packet_size = _dconv_kernel->kPacketSize(); + const int padded_filter_inner_dim_size = + ((depth + k_packet_size - 1) / k_packet_size) * k_packet_size; + + const int thread_count = _dconv_kernel->getThreadCount(); + + // _padded_filter + { + auto type_info = _kernel->get_info().typeInfo(); + ir::Shape shape({batch, filter_spatial_size, padded_filter_inner_dim_size}); + auto info = ir::OperandInfo::createStaticInfo(shape, type_info); + reqs.emplace_back(info, ExtraTensorLifeTime::BACKWARD, &_padded_filter); + } + + // _filter_buffers + { + auto type_info = _kernel->get_info().typeInfo(); + ir::Shape shape({thread_count, filter_spatial_size, padded_filter_inner_dim_size}); + auto info = ir::OperandInfo::createStaticInfo(shape, type_info); + reqs.emplace_back(info, ExtraTensorLifeTime::BACKWARD, &_filter_buffers); + } + + // _filter_dim_buffers + { + auto type = _back_prop_input->get_info().typeInfo(); + ir::Shape shape({thread_count, padded_filter_inner_dim_size}); + auto info = ir::OperandInfo::createStaticInfo(shape, type); + reqs.emplace_back(info, ExtraTensorLifeTime::BACKWARD, &_filter_dim_buffers); + } + + return reqs; } void DepthwiseConvolutionLayer::forward(bool) { cpu::ops::DepthwiseConvolutionLayer::run(); } @@ -136,7 +187,7 @@ void DepthwiseConvolutionLayer::backwardFloat32() try { backprop_act = - backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output.get()); + backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output); } catch (const std::exception &e) { @@ -156,15 +207,15 @@ void DepthwiseConvolutionLayer::backwardFloat32() // Calculate gradient for input nnfw::cker::train::backpropInput( dconv_params, getShape(backprop_act), getBuffer(backprop_act), getShape(_kernel), - getBuffer(_kernel), getBuffer(_padded_filter.get()), getShape(_back_prop_input), - 
getBuffer(_back_prop_input), _use_padded_filter, getBuffer(_filter_buffers.get()), - getBuffer(_filter_dim_buffers.get())); + getBuffer(_kernel), getBuffer(_padded_filter), getShape(_back_prop_input), + getBuffer(_back_prop_input), _use_padded_filter, getBuffer(_filter_buffers), + getBuffer(_filter_dim_buffers)); // Calculate gradient for weights nnfw::cker::train::backpropFilter( dconv_params, getShape(backprop_act), getBuffer(backprop_act), getShape(_input), getBuffer(_input), getShape(_grad_weights), getBuffer(_grad_weights), - getBuffer(_padded_filter.get()), getBuffer(_filter_buffers.get())); + getBuffer(_padded_filter), getBuffer(_filter_buffers)); // Calculate gradient for bias if (_bias) diff --git a/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.h b/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.h index 5cd98e56721..1c98b24e18d 100644 --- a/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.h +++ b/runtime/onert/backend/train/ops/DepthwiseConvolutionLayer.h @@ -42,6 +42,8 @@ class DepthwiseConvolutionLayer : public ::onert::exec::train::ITrainableFunctio void configureBackward(IPortableTensor *back_prop_input, IPortableTensor *grad_weights, IPortableTensor *grad_bias, const IPortableTensor *back_prop_output, const ir::Activation activation); + + ExtraTensorRequests requestExtraTensors() override; void forward(bool training) override; void backward() override; @@ -54,12 +56,12 @@ class DepthwiseConvolutionLayer : public ::onert::exec::train::ITrainableFunctio IPortableTensor *_back_prop_input; const IPortableTensor *_back_prop_output; - // TODO Consider if these tensors should be built in TensorBuilder - std::unique_ptr _act_back_prop_output; + ExtraTensor *_act_back_prop_output; + bool _use_padded_filter; - std::unique_ptr _padded_filter; - std::unique_ptr _filter_buffers; - std::unique_ptr _filter_dim_buffers; + ExtraTensor *_padded_filter; + ExtraTensor *_filter_buffers; + ExtraTensor *_filter_dim_buffers; }; } // namespace ops diff --git a/runtime/onert/backend/train/ops/FullyConnectedLayer.cc b/runtime/onert/backend/train/ops/FullyConnectedLayer.cc index 9d35655b26f..f2e189134c7 100644 --- a/runtime/onert/backend/train/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/train/ops/FullyConnectedLayer.cc @@ -28,17 +28,24 @@ namespace using namespace onert; -std::unique_ptr -createTransposedTensor(const backend::IPortableTensor *origin_tensor) +ir::OperandInfo transposeOperandInfo(const ir::OperandInfo &origin_info) { - const auto &origin_shape = origin_tensor->getShape(); + const auto &origin_shape = origin_info.shape(); assert(origin_shape.rank() == 2); - auto transposed_info = origin_tensor->get_info(); + auto transposed_info = ir::OperandInfo(origin_info); auto transposed_shape = ir::Shape{origin_shape.dim(1), origin_shape.dim(0)}; transposed_info.shape(transposed_shape); - return std::make_unique(transposed_info); + return transposed_info; +} + +backend::train::ExtraTensorRequest +createTransposeTenosrRequest(const backend::IPortableTensor *origin, + backend::train::ExtraTensor **const addr) +{ + return backend::train::ExtraTensorRequest(transposeOperandInfo(origin->get_info()), + backend::train::ExtraTensorLifeTime::BACKWARD, addr); } } // namespace @@ -85,22 +92,27 @@ void FullyConnectedLayer::configureBackward( throw std::runtime_error{ "train FullyConnectedLayer: Input other ranks than 2 are not supported."}; - _transposed_weights = createTransposedTensor(weights); - _transposed_weights->setBuffer(std::make_shared(weights->total_size())); 
+ if (activation != ir::Activation::NONE) + { + } +} + +ExtraTensorRequests FullyConnectedLayer::requestExtraTensors() +{ + ExtraTensorRequests reqs; + + reqs.push_back(createTransposeTenosrRequest(_weights, &_transposed_weights)); - _transposed_input = createTransposedTensor(input); - _transposed_input->setBuffer(std::make_shared(input->total_size())); + reqs.push_back(createTransposeTenosrRequest(_input, &_transposed_input)); - _transposed_back_prop_output = createTransposedTensor(back_prop_output); - _transposed_back_prop_output->setBuffer( - std::make_shared(back_prop_output->total_size())); + reqs.push_back(createTransposeTenosrRequest(_back_prop_output, &_transposed_back_prop_output)); - if (activation != ir::Activation::NONE) + if (_activation != ir::Activation::NONE) { - _act_back_prop_output = std::make_unique(_back_prop_output->get_info()); - _act_back_prop_output->setBuffer( - std::make_shared(_back_prop_output->total_size())); + reqs.push_back(ExtraTensorRequest::createLike(_back_prop_output, &_act_back_prop_output)); } + + return reqs; } void FullyConnectedLayer::forward(bool) { cpu::ops::FullyConnectedLayer::run(); } @@ -130,7 +142,7 @@ void FullyConnectedLayer::backwardFloat32() try { backprop_act = - backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output.get()); + backpropActivation(_activation, _output, _back_prop_output, _act_back_prop_output); } catch (const std::exception &e) { @@ -157,7 +169,7 @@ void FullyConnectedLayer::backwardFloat32() // Transpose and compute gradient for input // ∂L/∂X = fc(Incoming gradient, transposed W) - auto transposed_weights = _transposed_weights.get(); + auto transposed_weights = _transposed_weights; assert(transposed_weights->getShape().rank() == 2); nnfw::cker::Transpose(transpose_param, getShape(_weights), getBuffer(_weights), getShape(transposed_weights), getBuffer(transposed_weights)); @@ -169,12 +181,12 @@ void FullyConnectedLayer::backwardFloat32() // Transpose and compute gradient for weights // ∂L/∂W = fc(transposed incomming gradient, transposed X) - auto transposed_input = _transposed_input.get(); + auto transposed_input = _transposed_input; assert(transposed_input->getShape().rank() == 2); nnfw::cker::Transpose(transpose_param, getShape(_input), getBuffer(_input), getShape(transposed_input), getBuffer(transposed_input)); - auto transposed_back_prop_output = _transposed_back_prop_output.get(); + auto transposed_back_prop_output = _transposed_back_prop_output; assert(transposed_back_prop_output->getShape().rank() == 2); nnfw::cker::Transpose(transpose_param, getShape(backprop_act), getBuffer(backprop_act), getShape(transposed_back_prop_output), diff --git a/runtime/onert/backend/train/ops/FullyConnectedLayer.h b/runtime/onert/backend/train/ops/FullyConnectedLayer.h index 190bfbffe42..a19da636d8d 100644 --- a/runtime/onert/backend/train/ops/FullyConnectedLayer.h +++ b/runtime/onert/backend/train/ops/FullyConnectedLayer.h @@ -46,6 +46,7 @@ class FullyConnectedLayer : public exec::train::ITrainableFunction, const IPortableTensor *back_prop_output, ir::Activation activation, ir::FullyConnectedWeightsFormat weights_format); + ExtraTensorRequests requestExtraTensors() override; void forward(bool training) override; void backward() override; @@ -58,11 +59,10 @@ class FullyConnectedLayer : public exec::train::ITrainableFunction, IPortableTensor *_back_prop_input; const IPortableTensor *_back_prop_output; - // TODO Optimize memory - std::unique_ptr _transposed_weights; - std::unique_ptr _transposed_input; - 
std::unique_ptr _transposed_back_prop_output; - std::unique_ptr _act_back_prop_output; + ExtraTensor *_transposed_weights; + ExtraTensor *_transposed_input; + ExtraTensor *_transposed_back_prop_output; + ExtraTensor *_act_back_prop_output; }; } // namespace ops diff --git a/runtime/onert/backend/train/ops/PoolLayer.cc b/runtime/onert/backend/train/ops/PoolLayer.cc index f77d58e6517..0e0c468de5e 100644 --- a/runtime/onert/backend/train/ops/PoolLayer.cc +++ b/runtime/onert/backend/train/ops/PoolLayer.cc @@ -41,78 +41,90 @@ class MaxPool2D final : public TrainingKernelRegistry const IPortableTensor *_output; nnfw::cker::PoolParams _op_params; - std::unique_ptr _act_back_prop_output; - std::unique_ptr _arg_max_index; + ExtraTensor *_act_back_prop_output; + ExtraTensor *_arg_max_index; public: MaxPool2D(const uint32_t paddingLeft, const uint32_t, const uint32_t paddingTop, const uint32_t, const uint32_t strideWidth, const uint32_t strideHeight, const uint32_t kernelWidth, const uint32_t kernelHeight, const ir::Activation activation, const IPortableTensor *output) - : _activation(activation), _output(output) - { - { - _op_params.stride_height = strideHeight; - _op_params.stride_width = strideWidth; - _op_params.filter_height = kernelHeight; - _op_params.filter_width = kernelWidth; - _op_params.padding_values.height = (int8_t)paddingTop; - _op_params.padding_values.width = (int8_t)paddingLeft; - CalculateActivationRange(activation, &_op_params.float_activation_min, - &_op_params.float_activation_max); - } - - _arg_max_index = std::make_unique(_output->get_info()); - _arg_max_index->setBuffer(std::make_shared(_output->total_size())); - - if (activation != ir::Activation::NONE) - { - _act_back_prop_output = std::make_unique(_output->get_info()); - _act_back_prop_output->setBuffer(std::make_shared(_output->total_size())); - } - }; - - ~MaxPool2D() {} + : _activation(activation), _output(output){{_op_params.stride_height = strideHeight; + _op_params.stride_width = strideWidth; + _op_params.filter_height = kernelHeight; + _op_params.filter_width = kernelWidth; + _op_params.padding_values.height = (int8_t)paddingTop; + _op_params.padding_values.width = (int8_t)paddingLeft; + CalculateActivationRange(activation, &_op_params.float_activation_min, + &_op_params.float_activation_max); +} + +/* +_arg_max_index = std::make_unique(_output->get_info()); +_arg_max_index->setBuffer(std::make_shared(_output->total_size())); + +if (activation != ir::Activation::NONE) +{ + _act_back_prop_output = std::make_unique(_output->get_info()); + _act_back_prop_output->setBuffer(std::make_shared(_output->total_size())); +} +*/ +}; // namespace + +~MaxPool2D() {} public: - void forward(const IPortableTensor *in, IPortableTensor *out) - { - assert(in->layout() == ir::Layout::NHWC); +ExtraTensorRequests requestExtraTensors() override +{ + auto r1 = ExtraTensorRequest(_output->get_info(), ExtraTensorLifeTime::FORWARD_TO_BACKWARD, + &_arg_max_index); + auto r2 = ExtraTensorRequest::createLike(_output, &_act_back_prop_output); - auto out_shape = getShape(out); - auto out_data = getBuffer(out); - auto arg_max_index = _arg_max_index.get(); + ExtraTensorRequests reqs; + reqs.push_back(r1); + reqs.push_back(r2); - // maxpool forward - nnfw::cker::train::MaxPool2D(_op_params, getShape(in), getBuffer(in), out_shape, - out_data, getBuffer(arg_max_index)); - } + return reqs; +} + +public: +void forward(const IPortableTensor *in, IPortableTensor *out) override +{ + assert(in->layout() == ir::Layout::NHWC); - void backward(const 
IPortableTensor *back_prop_out, IPortableTensor *back_prop_in) + auto out_shape = getShape(out); + auto out_data = getBuffer(out); + auto arg_max_index = _arg_max_index; + + // maxpool forward + nnfw::cker::train::MaxPool2D(_op_params, getShape(in), getBuffer(in), out_shape, out_data, + getBuffer(arg_max_index)); +} + +void backward(const IPortableTensor *back_prop_out, IPortableTensor *back_prop_in) override +{ + assert(back_prop_out->layout() == ir::Layout::NHWC); + + // activation backward + try + { + back_prop_out = backpropActivation(_activation, _output, back_prop_out, _act_back_prop_output); + } + catch (const std::exception &e) { - assert(back_prop_out->layout() == ir::Layout::NHWC); - - // activation backward - try - { - back_prop_out = - backpropActivation(_activation, _output, back_prop_out, _act_back_prop_output.get()); - } - catch (const std::exception &e) - { - throw std::runtime_error{"train PoolLayer: " + std::string(e.what())}; - } - assert(back_prop_out != nullptr); - - // maxpool baackward - auto arg_max_index = _arg_max_index.get(); - nnfw::cker::train::MaxPool2DGrad(getShape(back_prop_out), getBuffer(back_prop_out), - getBuffer(arg_max_index), getShape(back_prop_in), - getBuffer(back_prop_in)); + throw std::runtime_error{"train PoolLayer: " + std::string(e.what())}; } -}; + assert(back_prop_out != nullptr); -} // namespace + // maxpool baackward + auto arg_max_index = _arg_max_index; + nnfw::cker::train::MaxPool2DGrad(getShape(back_prop_out), getBuffer(back_prop_out), + getBuffer(arg_max_index), getShape(back_prop_in), + getBuffer(back_prop_in)); +} +}; // namespace ops + +} // namespace train PoolLayer::PoolLayer() : cpu::ops::PoolLayer(), _back_prop_input(nullptr), _back_prop_output(nullptr), _kernel(nullptr) @@ -149,6 +161,8 @@ void PoolLayer::configureBackward(const uint32_t paddingLeft, const uint32_t pad } } +ExtraTensorRequests PoolLayer::requestExtraTensors() { return _kernel->requestExtraTensors(); } + void PoolLayer::forward(bool training) { if (training) @@ -163,7 +177,7 @@ void PoolLayer::forward(bool training) void PoolLayer::backward() { _kernel->backward(_back_prop_output, _back_prop_input); } -} // namespace ops -} // namespace train +} // namespace backend +} // namespace onert } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/ops/PoolLayer.h b/runtime/onert/backend/train/ops/PoolLayer.h index 5ced951ae6a..238e5afe77e 100644 --- a/runtime/onert/backend/train/ops/PoolLayer.h +++ b/runtime/onert/backend/train/ops/PoolLayer.h @@ -38,6 +38,8 @@ class TrainingKernelRegistry public: virtual void forward(const IPortableTensor *in, IPortableTensor *out) = 0; virtual void backward(const IPortableTensor *back_prop_out, IPortableTensor *back_prop_in) = 0; + virtual ExtraTensorRequests requestExtraTensors() = 0; + TrainingKernelRegistry() = default; virtual ~TrainingKernelRegistry() = default; }; @@ -61,6 +63,7 @@ class PoolLayer : public ::onert::exec::train::ITrainableFunction, public cpu::o IPortableTensor *output, IPortableTensor *back_prop_input, const IPortableTensor *back_prop_output); + ExtraTensorRequests requestExtraTensors() override; void forward(bool training) override; void backward() override; diff --git a/runtime/onert/core/include/backend/train/ExtraTensor.h b/runtime/onert/core/include/backend/train/ExtraTensor.h new file mode 100644 index 00000000000..20e2bb32e24 --- /dev/null +++ b/runtime/onert/core/include/backend/train/ExtraTensor.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_EXTRA_H__ +#define __ONERT_BACKEND_EXTRA_H__ + +#include + +namespace onert +{ +namespace backend +{ +namespace train +{ + +// Q: Is this renaming is necessary? +// ExtraTensor means that a tensor accessed by only one specific operation layer. +class ExtraTensor final : public basic::Tensor +{ +public: + ExtraTensor() = delete; + +public: + ExtraTensor(const ir::OperandInfo &info) : basic::Tensor(info, nullptr) + { + // DO NOTHING + } +}; + +} // namespace train +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_EXTRA_TENSOR_H__ diff --git a/runtime/onert/core/include/backend/train/ExtraTensorRequest.h b/runtime/onert/core/include/backend/train/ExtraTensorRequest.h new file mode 100644 index 00000000000..25d912fd908 --- /dev/null +++ b/runtime/onert/core/include/backend/train/ExtraTensorRequest.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_EXTRA_TENSOR_REQUEST_H__ +#define __ONERT_BACKEND_EXTRA_TENSOR_REQUEST_H__ + +#include "backend/train/ExtraTensor.h" + +namespace onert +{ +namespace backend +{ +namespace train +{ + +enum class ExtraTensorLifeTime +{ + BACKWARD, // alive during backward() + FORWARD_TO_BACKWARD, // alive from forward to backward() +}; + +class ExtraTensorRequest +{ + +public: + ExtraTensorRequest(ir::OperandInfo info, ExtraTensorLifeTime lt, + backend::train::ExtraTensor **addr) + : info(info), lifetime(lt), address(addr) + { + } + + static ExtraTensorRequest createLike(const IPortableTensor *origin, + backend::train::ExtraTensor **const addr) + { + assert(origin != nullptr); + assert(addr != nullptr); + + return ExtraTensorRequest(origin->get_info(), ExtraTensorLifeTime::BACKWARD, addr); + } + +public: + ir::OperandInfo info; + ExtraTensorLifeTime lifetime; + backend::train::ExtraTensor **address; +}; + +using ExtraTensorRequests = std::vector; + +} // namespace train +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_EXTRA_TENSOR_REQUEST_H__ diff --git a/runtime/onert/core/include/exec/train/ITrainableFunction.h b/runtime/onert/core/include/exec/train/ITrainableFunction.h index 45adc258f68..1d1271186f2 100644 --- a/runtime/onert/core/include/exec/train/ITrainableFunction.h +++ b/runtime/onert/core/include/exec/train/ITrainableFunction.h @@ -18,6 +18,7 @@ #define __ONERT_EXEC_TRAIN_I_TRAINABLE_FUNCTION_H__ #include +#include namespace onert { @@ -26,12 +27,21 @@ namespace exec namespace train { +// Q: function 'extraExtraTensors' is not PURE virutal function, If so, Do we need to change this +// class name? ITrainableFunction -> TrainableFunction class ITrainableFunction { public: virtual ~ITrainableFunction() = default; virtual void forward(bool training) = 0; virtual void backward() = 0; + + // Implement this if extra tensor is needed + virtual backend::train::ExtraTensorRequests requestExtraTensors() + { + backend::train::ExtraTensorRequests r; + return r; + } }; } // namespace train
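
Note (reviewer sketch, not part of the patch): the pieces above fit together as follows. A trainable kernel overrides ITrainableFunction::requestExtraTensors(); BackendContext::genKernels() collects the requests per operation; ExtraTensorGenerator registers each request with TensorBuilder; plan() claims and releases every extra tensor according to its ExtraTensorLifeTime along the forward and backward orders; and allocate() materializes the buffers, after which the raw ExtraTensor* written through ExtraTensorRequest::address becomes valid. A minimal, hypothetical layer using this flow might look like the sketch below. ExampleLayer and its members (_scratch, _saved, _output, _back_prop_output) are illustrative names only; the types and calls follow the headers added in this diff.

#include <backend/IPortableTensor.h>
#include <backend/train/ExtraTensorRequest.h>
#include <exec/train/ITrainableFunction.h>

// Hypothetical layer illustrating the ExtraTensor request flow (not part of this patch).
class ExampleLayer : public onert::exec::train::ITrainableFunction
{
public:
  onert::backend::train::ExtraTensorRequests requestExtraTensors() override
  {
    using namespace onert::backend::train;
    ExtraTensorRequests reqs;
    // Scratch tensor shaped like the back-prop output, alive only during backward()
    reqs.push_back(ExtraTensorRequest::createLike(_back_prop_output, &_scratch));
    // Tensor written in forward() and read in backward(), so it must stay alive in between
    reqs.emplace_back(_output->get_info(), ExtraTensorLifeTime::FORWARD_TO_BACKWARD, &_saved);
    return reqs;
  }

  void forward(bool) override { /* write into _saved as needed */ }
  void backward() override { /* read _saved, use _scratch as temporary storage */ }

private:
  const onert::backend::IPortableTensor *_output = nullptr;
  const onert::backend::IPortableTensor *_back_prop_output = nullptr;
  onert::backend::train::ExtraTensor *_scratch = nullptr;
  onert::backend::train::ExtraTensor *_saved = nullptr;
};

Because a request carries only an ir::OperandInfo plus a lifetime, planning can reuse the same FirstFit/Bump/WIC machinery that already serves DisposableTensorIndex, which is what the TrainMemoryManager and MemoryPlanner templating in this diff enables.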