From 63b876f8fc925b77e7f52abe963ec6d23412b73a Mon Sep 17 00:00:00 2001
From: Jiyoung Yun
Date: Fri, 2 Aug 2024 14:37:00 +0900
Subject: [PATCH] [onert] Save Reserved and Tensor data to the checkpoint file

ONE-DCO-1.0-Signed-off-by: Jiyoung Yun
---
NOTE(review): this patch was rebuilt from a whitespace-collapsed copy in which all
text between angle brackets was missing. std::vector<char>, std::vector<uint32_t>
and <numeric> follow from how the values are used. The unique_ptr element types
(ir::train::TrainingInfo, exec::Execution) and the <fstream>/<iostream> context
includes are assumptions - confirm them against the tree before applying.
The index lines are carried over unchanged and do not describe the edited blobs.

Changes on top of the original submission:
 * save() throws when writing or closing the stream fails
 * std::accumulate() starts from uint32_t{0} instead of an int literal
 * setAdamOffset() uses std::memcpy like the rest of the file
 * doc comments for the new functions and members

 .../include/exporter/CheckpointExporter.h     |  16 +-
 .../core/src/exporter/CheckpointExporter.cc   | 146 ++++++++++++++----
 2 files changed, 135 insertions(+), 27 deletions(-)

diff --git a/runtime/onert/core/include/exporter/CheckpointExporter.h b/runtime/onert/core/include/exporter/CheckpointExporter.h
index b2335c07a1e..33c532f0658 100644
--- a/runtime/onert/core/include/exporter/CheckpointExporter.h
+++ b/runtime/onert/core/include/exporter/CheckpointExporter.h
@@ -42,19 +42,23 @@ class CheckpointExporter
 {
 public:
   CheckpointExporter(std::unique_ptr<ir::train::TrainingInfo> &train_info,
-                     std::unique_ptr<exec::Execution> &execution);
+                     std::unique_ptr<exec::Execution> &exec);
 
   void save(const std::string &path);
 
 private:
-  uint32_t getTotalSize();
+  // Fill the reserved header: magic number, schema version, zeroed offsets
+  void setReservedData();
+  // Write the Adam m/v offsets into the reserved header
+  void setAdamOffset(uint32_t m_offset, uint32_t v_offset);
+  // Pack every trainable tensor (count, offset table, raw data) into _buffers
+  void setTensorData(std::unique_ptr<exec::Execution> &exec);
 
 private:
-  const uint16_t MAGIC_NUMBER = 429;
-  const uint8_t SCHEMA_VERSION = 1;
-  const uint8_t RESERVED = 0;
+  const uint32_t RESERVED_SIZE = 16; // size in bytes of the reserved header
 
-  std::vector<char> _data;
+  std::vector<char> _reserved; // reserved header, written first by save()
+  std::vector<char> _buffers;  // tensor section, written after the header
   std::mutex _mutex;
 };
 
diff --git a/runtime/onert/core/src/exporter/CheckpointExporter.cc b/runtime/onert/core/src/exporter/CheckpointExporter.cc
index d3d46a78867..5f29518ca60 100644
--- a/runtime/onert/core/src/exporter/CheckpointExporter.cc
+++ b/runtime/onert/core/src/exporter/CheckpointExporter.cc
@@ -21,6 +21,7 @@
 
 #include <fstream>
 #include <iostream>
+#include <numeric>
 
 namespace onert
 {
@@ -30,42 +31,145 @@ namespace exporter
 CheckpointExporter::CheckpointExporter(std::unique_ptr<ir::train::TrainingInfo> &train_info,
                                        std::unique_ptr<exec::Execution> &execution)
 {
-  uint32_t total_size = getTotalSize();
-  _data.resize(total_size);
+  // Both file sections are built here; save() only writes them out
+  setReservedData();
+  setTensorData(execution);
 
-  // Point to start of the buffer
-  char *ptr = _data.data();
+  // train_info is not used by the exporter at the moment
+  UNUSED_RELEASE(train_info);
+}
+
+/**
+ * @brief Write the reserved header followed by the tensor section to a file
+ *
+ * @param path Destination file; it is opened in binary mode and truncated
+ * @throw std::runtime_error if the reserved buffer has an unexpected size, or the file
+ *        cannot be opened or written
+ */
+void CheckpointExporter::save(const std::string &path)
+{
+  if (_reserved.size() != RESERVED_SIZE)
+    throw std::runtime_error{"Invalid reserved buffer"};
+
+  std::ofstream dst(path.c_str(), std::ios::binary | std::ios::trunc);
+  if (!dst.is_open())
+    throw std::runtime_error{"Failed to save checkpoint: " + path};
+
+  dst.write(_reserved.data(), _reserved.size());
+  dst.write(_buffers.data(), _buffers.size());
+  dst.close();
+
+  // write() and close() report errors only through the stream state
+  if (dst.fail())
+    throw std::runtime_error{"Failed to save checkpoint: " + path};
+}
+
+/**
+ * @brief Size the reserved header and write the magic number and schema version
+ *
+ * resize() value-initializes the bytes it adds, so the reserved byte and the offsets
+ * start out as zero.
+ */
+void CheckpointExporter::setReservedData()
+{
+  // Reserved - 16 bytes
+  //   magic number for 2 bytes
+  //   schema version for 1 byte
+  //   reserved for 1 byte
+  //   offset for 4 * 3 bytes
+  //     (moving average, value, other parameters offset)
+
+  _reserved.resize(RESERVED_SIZE);
+
+  // Pointer to the start address of the buffer
+  char *ptr = _reserved.data();
 
   // Write MAGIC NUMBER
+  const uint16_t MAGIC_NUMBER = 429;
   std::memcpy(ptr, &MAGIC_NUMBER, sizeof(MAGIC_NUMBER));
   ptr += sizeof(MAGIC_NUMBER);
 
   // Write SCHEMA VERSION
+  const uint8_t SCHEMA_VERSION = 1;
   std::memcpy(ptr, &SCHEMA_VERSION, sizeof(SCHEMA_VERSION));
-  ptr += sizeof(SCHEMA_VERSION);
-
-  // Reserved
-  ptr += sizeof(RESERVED);
-
-  UNUSED_RELEASE(train_info);
-  UNUSED_RELEASE(execution);
 }
 
-void CheckpointExporter::save(const std::string &path)
+/**
+ * @brief Store the Adam offsets in the reserved header
+ *
+ * @param m_offset Value stored at bytes 4..7 of the header
+ * @param v_offset Value stored at bytes 8..11 of the header
+ * @throw std::runtime_error if the reserved buffer has an unexpected size
+ */
+void CheckpointExporter::setAdamOffset(uint32_t m_offset, uint32_t v_offset)
 {
-  std::ofstream dst(path.c_str(), std::ios::binary | std::ios::trunc);
-  if (!dst.is_open())
-    throw std::runtime_error{"Failed to save checkpoint: " + path};
+  if (_reserved.size() != RESERVED_SIZE)
+    throw std::runtime_error{"Invalid reserved buffer"};
 
-  dst.write(_data.data(), _data.size());
-  dst.close();
+  // Pointer to the start address of the buffer
+  char *ptr = _reserved.data();
+  ptr += 4; // magic number(2) + schema version(1) + reserved(1)
+
+  // Write Adam M offset
+  std::memcpy(ptr, &m_offset, sizeof(m_offset));
+  ptr += sizeof(m_offset);
+
+  // Write Adam V offset
+  std::memcpy(ptr, &v_offset, sizeof(v_offset));
 }
 
-uint32_t CheckpointExporter::getTotalSize()
+/**
+ * @brief Pack all trainable tensors into the tensor section
+ *
+ * @param exec Execution whose trainable tensors are visited twice: sizes first, then data
+ */
+void CheckpointExporter::setTensorData(std::unique_ptr<exec::Execution> &exec)
 {
-  uint32_t size = 0;
-  size += (sizeof(MAGIC_NUMBER) + sizeof(SCHEMA_VERSION) + sizeof(RESERVED));
-  return size;
+  // Tensor Buffers
+  //   number of buffers for 4 bytes
+  //   1..N offset for 4 * N bytes
+  //   buffers for buf_1 size + buf_2 size + .. buf_N size bytes
+
+  // get Tensor count
+  std::vector<uint32_t> sizes;
+  exec->iterateTrainableTensors(
+    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
+      sizes.emplace_back(tensor->total_size());
+    });
+
+  uint32_t count = sizes.size();
+  // The init value fixes the accumulator type; a plain 0 would sum into an int
+  uint32_t total_size = std::accumulate(sizes.begin(), sizes.end(), uint32_t{0});
+  auto buf_size = sizeof(uint32_t) + sizeof(uint32_t) * count + total_size;
+  _buffers.resize(buf_size);
+
+  // Pointer to the start address of the buffer
+  char *ptr = _buffers.data();
+
+  // Write n_buffers
+  std::memcpy(ptr, &count, sizeof(count));
+  ptr += sizeof(count);
+
+  // Write offset
+  // The first offset includes RESERVED_SIZE because save() writes the header first
+  uint32_t buf_offset = RESERVED_SIZE + sizeof(count) + sizeof(uint32_t) * count;
+  for (uint32_t v : sizes)
+  {
+    std::memcpy(ptr, &buf_offset, sizeof(buf_offset));
+    ptr += sizeof(buf_offset);
+
+    buf_offset += v;
+  }
+
+  // Write tensor buffers
+  // vindex is only read by the assert, which checks both passes see the same sizes
+  [[maybe_unused]] auto vindex = 0;
+  exec->iterateTrainableTensors(
+    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
+      assert(sizes[vindex++] == tensor->total_size());
+      std::memcpy(ptr, tensor->buffer(), tensor->total_size());
+      ptr += tensor->total_size();
+    });
 }
 
 } // namespace exporter