Skip to content

Commit

Permalink
[onert] Save Reserved and Tensor data to the checkpoint file
Browse files Browse the repository at this point in the history
ONE-DCO-1.0-Signed-off-by: Jiyoung Yun <[email protected]>
  • Loading branch information
jyoungyun committed Aug 2, 2024
1 parent d79e86d commit cc026ce
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 27 deletions.
13 changes: 7 additions & 6 deletions runtime/onert/core/include/exporter/CheckpointExporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,20 @@ class CheckpointExporter
{
public:
CheckpointExporter(std::unique_ptr<onert::ir::train::TrainingInfo> &train_info,
std::unique_ptr<onert::exec::Execution> &execution);
std::unique_ptr<onert::exec::Execution> &exec);

void save(const std::string &path);

private:
uint32_t getTotalSize();
void setReservedData();
void setAdamOffset(uint32_t m_offset, uint32_t v_offset);
void setTensorData(std::unique_ptr<onert::exec::Execution> &exec);

private:
const uint16_t MAGIC_NUMBER = 429;
const uint8_t SCHEMA_VERSION = 1;
const uint8_t RESERVED = 0;
const uint32_t RESERVED_SIZE = 16;

std::vector<char> _data;
std::vector<char> _reserved;
std::vector<char> _buffers;
std::mutex _mutex;
};

Expand Down
112 changes: 91 additions & 21 deletions runtime/onert/core/src/exporter/CheckpointExporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <fstream>
#include <iostream>
#include <numeric>

namespace onert
{
Expand All @@ -30,42 +31,111 @@ namespace exporter
// Build the in-memory checkpoint image at construction time.
//
// The image is kept in two pieces that save() later writes back to back:
//   _reserved : fixed-size header filled by setReservedData()
//   _buffers  : tensor table and tensor contents filled by setTensorData()
//
// train_info is not used yet.
CheckpointExporter::CheckpointExporter(std::unique_ptr<onert::ir::train::TrainingInfo> &train_info,
                                       std::unique_ptr<onert::exec::Execution> &execution)
{
  setReservedData();
  setTensorData(execution);

  UNUSED_RELEASE(train_info);
}

// Write the checkpoint (reserved header followed by tensor buffers) to `path`.
//
// The file is opened in binary mode and truncated if it already exists.
//
// Throws std::runtime_error when the reserved header has an unexpected size, when the file
// cannot be opened, or when writing/flushing the data fails.
void CheckpointExporter::save(const std::string &path)
{
  if (_reserved.size() != RESERVED_SIZE)
    throw std::runtime_error{"Invalid reserved buffer"};

  std::ofstream dst(path.c_str(), std::ios::binary | std::ios::trunc);
  if (!dst.is_open())
    throw std::runtime_error{"Failed to save checkpoint: " + path};

  dst.write(_reserved.data(), static_cast<std::streamsize>(_reserved.size()));
  dst.write(_buffers.data(), static_cast<std::streamsize>(_buffers.size()));

  // close() flushes buffered output; a failed write or flush leaves failbit/badbit set.
  // Without this check a truncated checkpoint (e.g. disk full) would be reported as success.
  dst.close();
  if (dst.fail())
    throw std::runtime_error{"Failed to save checkpoint: " + path};
}

// Fill the fixed-size reserved header.
//
// Reserved - 16 bytes
//   magic number for 2 bytes
//   schema version for 1 byte
//   reserved for 1 byte
//   offset for 4 * 3 bytes
//     (moving average, value, other parameters offset)
//
// Only the magic number and the schema version are written here. Values are copied with
// memcpy, so they are stored in the host's byte order.
void CheckpointExporter::setReservedData()
{
  // resize() zero-fills the bytes it adds, so the reserved byte and the offsets start as 0
  _reserved.resize(RESERVED_SIZE);

  // Pointer to the start address of the buffer
  char *ptr = _reserved.data();

  // Write MAGIC NUMBER
  const uint16_t MAGIC_NUMBER = 429;
  std::memcpy(ptr, &MAGIC_NUMBER, sizeof(MAGIC_NUMBER));
  ptr += sizeof(MAGIC_NUMBER);

  // Write SCHEMA VERSION
  const uint8_t SCHEMA_VERSION = 1;
  std::memcpy(ptr, &SCHEMA_VERSION, sizeof(SCHEMA_VERSION));
}

void CheckpointExporter::save(const std::string &path)
// Store the Adam M and V offsets in the reserved header.
//
// The two 4-byte values are written directly after the first four header bytes
// (magic number(2) + schema version(1) + reserved(1)): m_offset first, v_offset second.
//
// Throws std::runtime_error when the reserved header does not have RESERVED_SIZE bytes,
// e.g. because setReservedData() has not run yet.
void CheckpointExporter::setAdamOffset(uint32_t m_offset, uint32_t v_offset)
{
  if (_reserved.size() != RESERVED_SIZE)
    throw std::runtime_error{"Invalid reserved buffer"};

  // Skip magic number(2) + schema version(1) + reserved(1)
  char *ptr = _reserved.data() + 4;

  // Write Adam M offset
  std::memcpy(ptr, &m_offset, sizeof(m_offset));
  ptr += sizeof(m_offset);

  // Write Adam V offset
  std::memcpy(ptr, &v_offset, sizeof(v_offset));
}

uint32_t CheckpointExporter::getTotalSize()
// Serialize every trainable tensor of `exec` into _buffers.
//
// Tensor Buffers
//   number of buffers for 4 bytes
//   1..N offset for 4 * N bytes
//   buffers for buf_1 size + buf_2 size + .. buf_N size bytes
//
// Each offset is counted from the start of the checkpoint file, i.e. it already includes the
// RESERVED_SIZE-byte header that save() writes in front of _buffers.
//
// Throws std::runtime_error when `exec` is null, when the data cannot be addressed with 32-bit
// offsets, or when the tensors seen while copying differ from those seen while sizing.
void CheckpointExporter::setTensorData(std::unique_ptr<onert::exec::Execution> &exec)
{
  if (!exec)
    throw std::runtime_error{"Invalid execution"};

  // First pass: collect the byte size of each trainable tensor
  std::vector<uint32_t> sizes;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      sizes.emplace_back(tensor->total_size());
    });

  // Sum in 64 bits: with an `int` initial value std::accumulate would accumulate in int
  const uint64_t payload_size = std::accumulate(sizes.begin(), sizes.end(), uint64_t{0});
  // n_buffers field plus one offset per buffer
  const uint64_t table_size = sizeof(uint32_t) * (uint64_t{1} + sizes.size());

  // Offsets are stored as uint32_t, so the whole file has to stay addressable with 32 bits
  if (RESERVED_SIZE + table_size + payload_size > UINT32_MAX)
    throw std::runtime_error{"Checkpoint data is too large"};

  const uint32_t count = static_cast<uint32_t>(sizes.size());
  _buffers.resize(static_cast<size_t>(table_size + payload_size));

  // Pointer to the start address of the buffer
  char *ptr = _buffers.data();

  // Write n_buffers
  std::memcpy(ptr, &count, sizeof(count));
  ptr += sizeof(count);

  // Write offset
  uint32_t buf_offset = static_cast<uint32_t>(RESERVED_SIZE + table_size);
  for (const uint32_t size : sizes)
  {
    std::memcpy(ptr, &buf_offset, sizeof(buf_offset));
    ptr += sizeof(buf_offset);

    buf_offset += size;
  }

  // Second pass: write tensor buffers.
  // The size check is a real runtime check rather than an assert: it is the only thing that
  // keeps memcpy inside _buffers, and an assert (with its index increment) vanishes under NDEBUG.
  size_t index = 0;
  exec->iterateTrainableTensors(
    [&](const ir::OperandIndex &, const backend::train::ITrainableTensor *tensor) {
      if (index >= sizes.size() || sizes[index] != tensor->total_size())
        throw std::runtime_error{"Trainable tensors changed while exporting checkpoint"};

      std::memcpy(ptr, tensor->buffer(), sizes[index]);
      ptr += sizes[index];
      ++index;
    });

  if (index != sizes.size())
    throw std::runtime_error{"Trainable tensors changed while exporting checkpoint"};
}

} // namespace exporter
Expand Down

0 comments on commit cc026ce

Please sign in to comment.