Skip to content

Commit

Permalink
ARROW-5736: [Format][C++] Support small bit-width indices in sparse t…
Browse files Browse the repository at this point in the history
…ensor

This change allows any integer type to be used as the value type of a sparse tensor index.

Closes apache#5290 from mrkn/ARROW-5736 and squashes the following commits:

b0be574 <Kenta Murata> Add check SparseCOOIndex::coords_->ndim() in the constructor
ad725c2 <Kenta Murata> Remove needless include
84e6726 <Kenta Murata> Fix typo
76e5ca7 <Kenta Murata> Increase coverage rate
2226c9a <Kenta Murata> Fix compile errors on MSVC
f1cf53f <Kenta Murata> Insert an explicit down cast
56ffc1b <Kenta Murata> Add #include <limits>
a3fd502 <Kenta Murata> Refactoring
f0222ab <Kenta Murata> Support reading and writing
f2a1920 <Kenta Murata> Use fixtures in TestSparseCSRMatrix
4079906 <Kenta Murata> Refactoring of TestSparseCOOTensor
980cf96 <Kenta Murata> Refactoring
becc425 <Kenta Murata> Enable all integer types to become the value type of SparseCOOIndex
46acdb1 <Kenta Murata> Refactor ipc-read-write-test with a typed test case
b0053de <Kenta Murata> Add assertions for dim_names and dim_name
b7b47c7 <Kenta Murata> Refactor sparse_tensor-test with a typed test case
45be5c4 <Kenta Murata> Support reading and writing row-major SparseCOOIndex
33b6d1f <Kenta Murata> Make SparseCOOIndex support row-major index
5f8ba9f <Kenta Murata> Add indicesStrides in SparseTensorIndexCOO
5ec7e65 <Kenta Murata> Modify the comment of SparseTensorIndexCOO
8a0f65b <Kenta Murata> Add type fields of sparse index types

Authored-by: Kenta Murata <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mrkn authored and pitrou committed Sep 10, 2019
1 parent b1025c2 commit 0fbaff6
Show file tree
Hide file tree
Showing 9 changed files with 727 additions and 306 deletions.
50 changes: 47 additions & 3 deletions cpp/src/arrow/ipc/metadata_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -967,9 +967,23 @@ Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index,
flatbuf::SparseTensorIndex* fb_sparse_index_type,
Offset* fb_sparse_index, size_t* num_buffers) {
*fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO;

// We assume that the value type of indices tensor is an integer.
const auto& index_value_type =
checked_cast<const IntegerType&>(*sparse_index.indices()->type());
auto indices_type_offset =
flatbuf::CreateInt(fbb, index_value_type.bit_width(), index_value_type.is_signed());

auto fb_strides =
fbb.CreateVector(util::MakeNonNull(sparse_index.indices()->strides().data()),
sparse_index.indices()->strides().size());

const BufferMetadata& indices_metadata = buffers[0];
flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
*fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union();

*fb_sparse_index =
flatbuf::CreateSparseTensorIndexCOO(fbb, indices_type_offset, fb_strides, &indices)
.Union();
*num_buffers = 1;
return Status::OK();
}
Expand All @@ -979,11 +993,28 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index,
flatbuf::SparseTensorIndex* fb_sparse_index_type,
Offset* fb_sparse_index, size_t* num_buffers) {
*fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR;

// We assume that the value type of indptr tensor is an integer.
const auto& indptr_value_type =
checked_cast<const IntegerType&>(*sparse_index.indptr()->type());
auto indptr_type_offset = flatbuf::CreateInt(fbb, indptr_value_type.bit_width(),
indptr_value_type.is_signed());

const BufferMetadata& indptr_metadata = buffers[0];
const BufferMetadata& indices_metadata = buffers[1];
flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length);

// We assume that the value type of indices tensor is an integer.
const auto& indices_value_type =
checked_cast<const IntegerType&>(*sparse_index.indices()->type());
auto indices_type_offset = flatbuf::CreateInt(fbb, indices_value_type.bit_width(),
indices_value_type.is_signed());

const BufferMetadata& indices_metadata = buffers[1];
flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
*fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union();

*fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, indptr_type_offset, &indptr,
indices_type_offset, &indices)
.Union();
*num_buffers = 2;
return Status::OK();
}
Expand Down Expand Up @@ -1189,6 +1220,19 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
}

// Decodes the value type of the indices tensor recorded in a
// SparseTensorIndexCOO flatbuffer table.
//
// \param[in] sparse_index flatbuffer table describing the COO index
// \param[out] indices_type receives the type decoded by IntFromFlatbuffer
// \return the Status produced by IntFromFlatbuffer
//
// NOTE(review): the indicesType() result is forwarded without a null check;
// confirm how IntFromFlatbuffer behaves when the field is absent.
Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
                                 std::shared_ptr<DataType>* indices_type) {
  const auto* fb_indices_type = sparse_index->indicesType();
  return IntFromFlatbuffer(fb_indices_type, indices_type);
}

// Decodes the value types of the indptr and indices tensors recorded in a
// SparseMatrixIndexCSR flatbuffer table.
//
// \param[in] sparse_index flatbuffer table describing the CSR index
// \param[out] indptr_type receives the decoded type of the indptr tensor
// \param[out] indices_type receives the decoded type of the indices tensor
// \return the first non-OK Status from IntFromFlatbuffer, otherwise OK
//
// The indptr type is decoded first; if that fails, indices_type is not touched.
Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
                                 std::shared_ptr<DataType>* indptr_type,
                                 std::shared_ptr<DataType>* indices_type) {
  const auto* fb_indptr_type = sparse_index->indptrType();
  const auto* fb_indices_type = sparse_index->indicesType();

  RETURN_NOT_OK(IntFromFlatbuffer(fb_indptr_type, indptr_type));
  RETURN_NOT_OK(IntFromFlatbuffer(fb_indices_type, indices_type));
  return Status::OK();
}

Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
std::vector<int64_t>* shape,
std::vector<std::string>* dim_names,
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/ipc/metadata_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
std::vector<int64_t>* shape, std::vector<int64_t>* strides,
std::vector<std::string>* dim_names);

// EXPERIMENTAL: Extracting metadata of a SparseCOOIndex from the message
//
// Takes the flatbuffer table of the COO index (not the whole message buffer)
// and stores the value type of its indices tensor in `indices_type`.
Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
std::shared_ptr<DataType>* indices_type);

// EXPERIMENTAL: Extracting metadata of a SparseCSRIndex from the message
//
// Takes the flatbuffer table of the CSR index (not the whole message buffer)
// and stores the value types of its indptr and indices tensors in
// `indptr_type` and `indices_type` respectively.
Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
std::shared_ptr<DataType>* indptr_type,
std::shared_ptr<DataType>* indices_type);

// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message
Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
std::vector<int64_t>* shape,
Expand Down
182 changes: 148 additions & 34 deletions cpp/src/arrow/ipc/read_write_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1043,34 +1043,58 @@ TEST_F(TestTensorRoundTrip, NonContiguous) {
CheckTensorRoundTrip(tensor);
}

template <typename IndexValueType>
class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture {
public:
void SetUp() { IpcTestFixture::SetUp(); }
void TearDown() { IpcTestFixture::TearDown(); }

template <typename SparseIndexType>
void CheckSparseTensorRoundTrip(const SparseTensorImpl<SparseIndexType>& tensor) {
GTEST_FAIL();
void CheckSparseTensorRoundTrip(const SparseTensorCOO& sparse_tensor);
void CheckSparseTensorRoundTrip(const SparseTensorCSR& sparse_tensor);

protected:
std::shared_ptr<SparseCOOIndex> MakeSparseCOOIndex(
const std::vector<int64_t>& coords_shape,
const std::vector<int64_t>& coords_strides,
std::vector<typename IndexValueType::c_type>& coords_values) const {
auto coords_data = Buffer::Wrap(coords_values);
auto coords = std::make_shared<NumericTensor<IndexValueType>>(
coords_data, coords_shape, coords_strides);
return std::make_shared<SparseCOOIndex>(coords);
}

// Builds a SparseTensorCOO from an existing COO index and a vector of values.
//
// \tparam ValueType C type of the values; the Arrow data type is obtained
//         from CTypeTraits<ValueType>::type_singleton()
// \param[in] si sparse index passed to the tensor
// \param[in] sparse_values values, wrapped via Buffer::Wrap
// \param[in] shape shape passed to the tensor
// \param[in] dim_names optional dimension names (empty by default)
// \return the newly created sparse tensor
//
// NOTE(review): presumably Buffer::Wrap references the vector's storage
// instead of copying it, so `sparse_values` must outlive the tensor -- confirm.
template <typename ValueType>
std::shared_ptr<SparseTensorCOO> MakeSparseTensorCOO(
    const std::shared_ptr<SparseCOOIndex>& si, std::vector<ValueType>& sparse_values,
    const std::vector<int64_t>& shape,
    const std::vector<std::string>& dim_names = {}) const {
  auto values_buffer = Buffer::Wrap(sparse_values);
  auto value_type = CTypeTraits<ValueType>::type_singleton();
  return std::make_shared<SparseTensorCOO>(si, value_type, values_buffer, shape,
                                           dim_names);
}
};

template <>
void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip<SparseCOOIndex>(
const SparseTensorImpl<SparseCOOIndex>& tensor) {
const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
template <typename IndexValueType>
void TestSparseTensorRoundTrip<IndexValueType>::CheckSparseTensorRoundTrip(
const SparseTensorCOO& sparse_tensor) {
const auto& type = checked_cast<const FixedWidthType&>(*sparse_tensor.type());
const int elem_size = type.bit_width() / 8;
const int index_elem_size = sizeof(typename IndexValueType::c_type);

int32_t metadata_length;
int64_t body_length;

ASSERT_OK(mmap_->Seek(0));

ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length,
default_memory_pool()));

const auto& sparse_index = checked_cast<const SparseCOOIndex&>(*tensor.sparse_index());
const int64_t indices_length = elem_size * sparse_index.indices()->size();
const int64_t data_length = elem_size * tensor.non_zero_length();
const auto& sparse_index =
checked_cast<const SparseCOOIndex&>(*sparse_tensor.sparse_index());
const int64_t indices_length =
BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size());
const int64_t data_length =
BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length());
const int64_t expected_body_length = indices_length + data_length;
ASSERT_EQ(expected_body_length, body_length);

Expand All @@ -1083,27 +1107,32 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip<SparseCOOIndex>(
checked_cast<const SparseCOOIndex&>(*result->sparse_index());
ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
ASSERT_EQ(result->data()->size(), data_length);
ASSERT_TRUE(result->Equals(*result));
ASSERT_TRUE(result->Equals(sparse_tensor));
}

template <>
void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip<SparseCSRIndex>(
const SparseTensorImpl<SparseCSRIndex>& tensor) {
const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
template <typename IndexValueType>
void TestSparseTensorRoundTrip<IndexValueType>::CheckSparseTensorRoundTrip(
const SparseTensorCSR& sparse_tensor) {
const auto& type = checked_cast<const FixedWidthType&>(*sparse_tensor.type());
const int elem_size = type.bit_width() / 8;
const int index_elem_size = sizeof(typename IndexValueType::c_type);

int32_t metadata_length;
int64_t body_length;

ASSERT_OK(mmap_->Seek(0));

ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length,
default_memory_pool()));

const auto& sparse_index = checked_cast<const SparseCSRIndex&>(*tensor.sparse_index());
const int64_t indptr_length = elem_size * sparse_index.indptr()->size();
const int64_t indices_length = elem_size * sparse_index.indices()->size();
const int64_t data_length = elem_size * tensor.non_zero_length();
const auto& sparse_index =
checked_cast<const SparseCSRIndex&>(*sparse_tensor.sparse_index());
const int64_t indptr_length =
BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size());
const int64_t indices_length =
BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size());
const int64_t data_length =
BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length());
const int64_t expected_body_length = indptr_length + indices_length + data_length;
ASSERT_EQ(expected_body_length, body_length);

Expand All @@ -1117,30 +1146,103 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip<SparseCSRIndex>(
ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length);
ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
ASSERT_EQ(result->data()->size(), data_length);
ASSERT_TRUE(result->Equals(*result));
ASSERT_TRUE(result->Equals(sparse_tensor));
}

TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) {
TYPED_TEST_CASE_P(TestSparseTensorRoundTrip);

TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) {
using IndexValueType = TypeParam;
using c_index_value_type = typename IndexValueType::c_type;

std::string path = "test-write-sparse-coo-tensor";
constexpr int64_t kBufferSize = 1 << 20;
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));

// Dense representation:
// [
// [
// 1 0 2 0
// 0 3 0 4
// 5 0 6 0
// ],
// [
// 0 11 0 12
// 13 0 14 0
// 0 15 0 16
// ]
// ]
//
// Sparse representation:
// idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1]
// idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2]
// idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3]
// data = [1 2 3 4 5 6 11 12 13 14 15 16]

std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3,
0, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 3,
1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3};
const int sizeof_index_value = sizeof(c_index_value_type);
auto si = this->MakeSparseCOOIndex(
{12, 3}, {sizeof_index_value * 3, sizeof_index_value}, coords_values);

std::vector<int64_t> shape = {2, 3, 4};
std::vector<std::string> dim_names = {"foo", "bar", "baz"};
std::vector<int64_t> values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
std::vector<int64_t> values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names);

auto data = Buffer::Wrap(values);
NumericTensor<Int64Type> t(data, shape, {}, dim_names);
SparseTensorImpl<SparseCOOIndex> st(t);
this->CheckSparseTensorRoundTrip(*st);
}

TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexColumnMajor) {
using IndexValueType = TypeParam;
using c_index_value_type = typename IndexValueType::c_type;

std::string path = "test-write-sparse-coo-tensor";
constexpr int64_t kBufferSize = 1 << 20;
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));

// Dense representation:
// [
// [
// 1 0 2 0
// 0 3 0 4
// 5 0 6 0
// ],
// [
// 0 11 0 12
// 13 0 14 0
// 0 15 0 16
// ]
// ]
//
// Sparse representation:
// idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1]
// idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2]
// idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3]
// data = [1 2 3 4 5 6 11 12 13 14 15 16]

std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2,
0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3};
const int sizeof_index_value = sizeof(c_index_value_type);
auto si = this->MakeSparseCOOIndex(
{12, 3}, {sizeof_index_value, sizeof_index_value * 12}, coords_values);

CheckSparseTensorRoundTrip(st);
std::vector<int64_t> shape = {2, 3, 4};
std::vector<std::string> dim_names = {"foo", "bar", "baz"};
std::vector<int64_t> values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names);

this->CheckSparseTensorRoundTrip(*st);
}

TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
using IndexValueType = TypeParam;

std::string path = "test-write-sparse-csr-matrix";
constexpr int64_t kBufferSize = 1 << 20;
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));

std::vector<int64_t> shape = {4, 6};
std::vector<std::string> dim_names = {"foo", "bar", "baz"};
Expand All @@ -1149,11 +1251,23 @@ TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {

auto data = Buffer::Wrap(values);
NumericTensor<Int64Type> t(data, shape, {}, dim_names);
SparseTensorImpl<SparseCSRIndex> st(t);
SparseTensorImpl<SparseCSRIndex> st(t, TypeTraits<IndexValueType>::type_singleton());

CheckSparseTensorRoundTrip(st);
this->CheckSparseTensorRoundTrip(st);
}

// Register the three type-parameterized round-trip tests defined above.
REGISTER_TYPED_TEST_CASE_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor,
WithSparseCOOIndexColumnMajor, WithSparseCSRIndex);

// Instantiate the suite once per index value type: every signed and unsigned
// integer type from 8 to 64 bits.
INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseTensorRoundTrip, Int8Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseTensorRoundTrip, UInt8Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseTensorRoundTrip, Int16Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseTensorRoundTrip, UInt16Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseTensorRoundTrip, Int32Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseTensorRoundTrip, UInt32Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseTensorRoundTrip, Int64Type);
INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseTensorRoundTrip, UInt64Type);

TEST(TestRecordBatchStreamReader, MalformedInput) {
const std::string empty_str = "";
const std::string garbage_str = "12345678";
Expand Down
Loading

0 comments on commit 0fbaff6

Please sign in to comment.