From 0fbaff6b9ba9fa2c147b333c0c8d0124be706722 Mon Sep 17 00:00:00 2001
From: Kenta Murata
Date: Tue, 10 Sep 2019 16:46:08 +0200
Subject: [PATCH] ARROW-5736: [Format][C++] Support small bit-width indices in
 sparse tensor

I'd like to enable all integer types to become the value type of sparse
tensor index.

Closes #5290 from mrkn/ARROW-5736 and squashes the following commits:

b0be574c3 Add check SparseCOOIndex::coords_->ndim() in the constructor
ad725c24e Remove needless include
84e672624 Fix typo
76e5ca73d Increase coverage rate
2226c9a28 Fix compile errors on MSVC
f1cf53faa Insert an explicit down cast
56ffc1b08 Add #include
a3fd50242 Refactoring
f0222abfa Support reading and writing
f2a192030 Use fixtures in TestSparseCSRMatrix
4079906b9 Refactoring of TestSparseCOOTensor
980cf9668 Refactoring
becc425fc Enable all integer types to become the value type of SparseCOOIndex
46acdb10b Refactor ipc-read-write-test with a typed test case
b0053decb Add assertions for dim_names and dim_name
b7b47c75a Refactor sparse_tensor-test with a typed test case
45be5c4e3 Support reading and writing row-major SparseCOOIndex
33b6d1ff0 Make SparseCOOIndex support row-major index
5f8ba9f4d Add indicesStrides in SparseTensorIndexCOO
5ec7e6554 Modify the comment of SparseTensorIndexCOO
8a0f65b63 Add type fields of sparse index types

Authored-by: Kenta Murata
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/ipc/metadata_internal.cc |  50 +++-
 cpp/src/arrow/ipc/metadata_internal.h  |   9 +
 cpp/src/arrow/ipc/read_write_test.cc   | 182 +++++++++---
 cpp/src/arrow/ipc/reader.cc            |  47 +--
 cpp/src/arrow/sparse_tensor.cc         | 189 +++++++-----
 cpp/src/arrow/sparse_tensor.h          |  30 +-
 cpp/src/arrow/sparse_tensor_test.cc    | 389 +++++++++++++++++--------
 cpp/src/arrow/visitor_inline.h         |  72 ++---
 format/SparseTensor.fbs                |  65 +++--
 9 files changed, 727 insertions(+), 306 deletions(-)

diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc
index 737cd7510a66e..6810351f982fa 100644
--- a/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/cpp/src/arrow/ipc/metadata_internal.cc
@@ -967,9 +967,23 @@ Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index,
                                 flatbuf::SparseTensorIndex* fb_sparse_index_type,
                                 Offset* fb_sparse_index, size_t* num_buffers) {
   *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO;
+
+  // We assume that the value type of the indices tensor is an integer.
+  const auto& index_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+  auto indices_type_offset =
+      flatbuf::CreateInt(fbb, index_value_type.bit_width(), index_value_type.is_signed());
+
+  auto fb_strides =
+      fbb.CreateVector(util::MakeNonNull(sparse_index.indices()->strides().data()),
+                       sparse_index.indices()->strides().size());
+
   const BufferMetadata& indices_metadata = buffers[0];
   flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
-  *fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union();
+
+  *fb_sparse_index =
+      flatbuf::CreateSparseTensorIndexCOO(fbb, indices_type_offset, fb_strides, &indices)
+          .Union();
   *num_buffers = 1;
   return Status::OK();
 }
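For orientation, a minimal usage sketch of the API this change enables, based on
the SparseTensorImpl constructor added in cpp/src/arrow/sparse_tensor.h further
below (the wrapper function name here is hypothetical, not part of the patch):

    #include <memory>
    #include "arrow/sparse_tensor.h"
    #include "arrow/type.h"

    // Sparsify a dense tensor using 16-bit COO indices instead of the
    // previously hard-coded int64 index type.
    std::shared_ptr<arrow::SparseTensor> ToCOOWithSmallIndices(
        const arrow::Tensor& dense) {
      return std::make_shared<arrow::SparseTensorImpl<arrow::SparseCOOIndex>>(
          dense, arrow::int16());
    }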
@@ -979,11 +993,28 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index,
                                 flatbuf::SparseTensorIndex* fb_sparse_index_type,
                                 Offset* fb_sparse_index, size_t* num_buffers) {
   *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR;
+
+  // We assume that the value type of the indptr tensor is an integer.
+  const auto& indptr_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indptr()->type());
+  auto indptr_type_offset = flatbuf::CreateInt(fbb, indptr_value_type.bit_width(),
+                                               indptr_value_type.is_signed());
+
   const BufferMetadata& indptr_metadata = buffers[0];
-  const BufferMetadata& indices_metadata = buffers[1];
   flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length);
+
+  // We assume that the value type of the indices tensor is an integer.
+  const auto& indices_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+  auto indices_type_offset = flatbuf::CreateInt(fbb, indices_value_type.bit_width(),
+                                                indices_value_type.is_signed());
+
+  const BufferMetadata& indices_metadata = buffers[1];
   flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
-  *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union();
+
+  *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, indptr_type_offset, &indptr,
+                                                         indices_type_offset, &indices)
+                         .Union();
   *num_buffers = 2;
   return Status::OK();
 }
@@ -1189,6 +1220,19 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
   return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
 }
 
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+                                 std::shared_ptr<DataType>* indices_type) {
+  return IntFromFlatbuffer(sparse_index->indicesType(), indices_type);
+}
+
+Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
+                                 std::shared_ptr<DataType>* indptr_type,
+                                 std::shared_ptr<DataType>* indices_type) {
+  RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indptrType(), indptr_type));
+  RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indicesType(), indices_type));
+  return Status::OK();
+}
+
 Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
                                std::vector<int64_t>* shape,
                                std::vector<std::string>* dim_names,
diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h
index 94adf640ebdbe..828affd13f242 100644
--- a/cpp/src/arrow/ipc/metadata_internal.h
+++ b/cpp/src/arrow/ipc/metadata_internal.h
@@ -103,6 +103,15 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
                          std::vector<int64_t>* shape, std::vector<int64_t>* strides,
                          std::vector<std::string>* dim_names);
 
+// EXPERIMENTAL: Extracting metadata of a SparseCOOIndex from the message
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+                                 std::shared_ptr<DataType>* indices_type);
+
+// EXPERIMENTAL: Extracting metadata of a SparseCSRIndex from the message
+Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
+                                 std::shared_ptr<DataType>* indptr_type,
+                                 std::shared_ptr<DataType>* indices_type);
+
 // EXPERIMENTAL: Extracting metadata of a sparse tensor from the message
 Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
                                std::vector<int64_t>* shape,
diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc
index 2f1ac5e980157..9cbeacfe9836b 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -1043,34 +1043,58 @@ TEST_F(TestTensorRoundTrip, NonContiguous) {
   CheckTensorRoundTrip(tensor);
 }
 
+template <typename IndexValueType>
 class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture {
  public:
   void SetUp() { IpcTestFixture::SetUp(); }
   void TearDown() { IpcTestFixture::TearDown(); }
 
-  template <typename SparseIndexType>
-  void CheckSparseTensorRoundTrip(const SparseTensorImpl<SparseIndexType>& tensor) {
-    GTEST_FAIL();
+  void CheckSparseTensorRoundTrip(const SparseTensorCOO& sparse_tensor);
+  void CheckSparseTensorRoundTrip(const SparseTensorCSR& sparse_tensor);
+
+ protected:
+  std::shared_ptr<SparseCOOIndex> MakeSparseCOOIndex(
+      const std::vector<int64_t>& coords_shape,
+      const std::vector<int64_t>& coords_strides,
+      std::vector<typename IndexValueType::c_type>& coords_values) const {
+    auto coords_data = Buffer::Wrap(coords_values);
+    auto coords = std::make_shared<NumericTensor<IndexValueType>>(
+        coords_data, coords_shape, coords_strides);
+    return std::make_shared<SparseCOOIndex>(coords);
+  }
+
+  template <typename ValueType>
+  std::shared_ptr<SparseTensorCOO> MakeSparseTensorCOO(
+      const std::shared_ptr<SparseCOOIndex>& si, std::vector<ValueType>& sparse_values,
+      const std::vector<int64_t>& shape,
+      const std::vector<std::string>& dim_names = {}) const {
+    auto data = Buffer::Wrap(sparse_values);
+    return std::make_shared<SparseTensorCOO>(si, CTypeTraits<ValueType>::type_singleton(),
+                                             data, shape, dim_names);
   }
 };
 
-template <>
-void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
-    const SparseTensorImpl<SparseCOOIndex>& tensor) {
-  const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
+template <typename IndexValueType>
+void TestSparseTensorRoundTrip<IndexValueType>::CheckSparseTensorRoundTrip(
+    const SparseTensorCOO& sparse_tensor) {
+  const auto& type = checked_cast<const FixedWidthType&>(*sparse_tensor.type());
   const int elem_size = type.bit_width() / 8;
+  const int index_elem_size = sizeof(typename IndexValueType::c_type);
 
   int32_t metadata_length;
   int64_t body_length;
 
   ASSERT_OK(mmap_->Seek(0));
 
-  ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
+  ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length,
                               default_memory_pool()));
 
-  const auto& sparse_index = checked_cast<const SparseCOOIndex&>(*tensor.sparse_index());
-  const int64_t indices_length = elem_size * sparse_index.indices()->size();
-  const int64_t data_length = elem_size * tensor.non_zero_length();
+  const auto& sparse_index =
+      checked_cast<const SparseCOOIndex&>(*sparse_tensor.sparse_index());
+  const int64_t indices_length =
+      BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size());
+  const int64_t data_length =
+      BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length());
   const int64_t expected_body_length = indices_length + data_length;
   ASSERT_EQ(expected_body_length, body_length);
 
@@ -1083,27 +1107,32 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
       checked_cast<const SparseCOOIndex&>(*result->sparse_index());
   ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
   ASSERT_EQ(result->data()->size(), data_length);
-  ASSERT_TRUE(result->Equals(*result));
+  ASSERT_TRUE(result->Equals(sparse_tensor));
 }
 
-template <>
-void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
-    const SparseTensorImpl<SparseCSRIndex>& tensor) {
-  const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
+template <typename IndexValueType>
+void TestSparseTensorRoundTrip<IndexValueType>::CheckSparseTensorRoundTrip(
+    const SparseTensorCSR& sparse_tensor) {
+  const auto& type = checked_cast<const FixedWidthType&>(*sparse_tensor.type());
   const int elem_size = type.bit_width() / 8;
+  const int index_elem_size = sizeof(typename IndexValueType::c_type);
 
   int32_t metadata_length;
   int64_t body_length;
 
   ASSERT_OK(mmap_->Seek(0));
 
-  ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
+  ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length,
                               default_memory_pool()));
 
-  const auto& sparse_index = checked_cast<const SparseCSRIndex&>(*tensor.sparse_index());
-  const int64_t indptr_length = elem_size * sparse_index.indptr()->size();
-  const int64_t indices_length = elem_size * sparse_index.indices()->size();
-  const int64_t data_length = elem_size * tensor.non_zero_length();
+  const auto& sparse_index =
+      checked_cast<const SparseCSRIndex&>(*sparse_tensor.sparse_index());
+  const int64_t indptr_length =
+      BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size());
+  const int64_t indices_length =
+      BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size());
+  const int64_t data_length =
+      BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length());
   const int64_t expected_body_length = indptr_length + indices_length + data_length;
   ASSERT_EQ(expected_body_length, body_length);
 
@@ -1117,30 +1146,103 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
   ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length);
   ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
   ASSERT_EQ(result->data()->size(), data_length);
-  ASSERT_TRUE(result->Equals(*result));
+  ASSERT_TRUE(result->Equals(sparse_tensor));
 }
 
-TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) {
+TYPED_TEST_CASE_P(TestSparseTensorRoundTrip);
+
+TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
   std::string path = "test-write-sparse-coo-tensor";
   constexpr int64_t kBufferSize = 1 << 20;
-  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));
+
+  // Dense representation:
+  // [
+  //   [
+  //     1 0 2 0
+  //     0 3 0 4
+  //     5 0 6 0
+  //   ],
+  //   [
+  //      0 11  0 12
+  //     13  0 14  0
+  //      0 15  0 16
+  //   ]
+  // ]
+  //
+  // Sparse representation:
+  //   idx[0] = [0 0 0 0 0 0 1  1  1  1  1  1]
+  //   idx[1] = [0 0 1 1 2 2 0  0  1  1  2  2]
+  //   idx[2] = [0 2 1 3 0 2 1  3  0  2  1  3]
+  //   data   = [1 2 3 4 5 6 11 12 13 14 15 16]
+
+  std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3,
+                                                   0, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 3,
+                                                   1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3};
+  const int sizeof_index_value = sizeof(c_index_value_type);
+  auto si = this->MakeSparseCOOIndex(
+      {12, 3}, {sizeof_index_value * 3, sizeof_index_value}, coords_values);
 
   std::vector<int64_t> shape = {2, 3, 4};
   std::vector<std::string> dim_names = {"foo", "bar", "baz"};
-  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
-                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+  std::vector<int64_t> values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
+  auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names);
 
-  auto data = Buffer::Wrap(values);
-  NumericTensor<Int64Type> t(data, shape, {}, dim_names);
-  SparseTensorImpl<SparseCOOIndex> st(t);
+  this->CheckSparseTensorRoundTrip(*st);
 }
 
+TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexColumnMajor) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  std::string path = "test-write-sparse-coo-tensor";
+  constexpr int64_t kBufferSize = 1 << 20;
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));
+
+  // Dense representation:
+  // [
+  //   [
+  //     1 0 2 0
+  //     0 3 0 4
+  //     5 0 6 0
+  //   ],
+  //   [
+  //      0 11  0 12
+  //     13  0 14  0
+  //      0 15  0 16
+  //   ]
+  // ]
+  //
+  // Sparse representation:
+  //   idx[0] = [0 0 0 0 0 0 1  1  1  1  1  1]
+  //   idx[1] = [0 0 1 1 2 2 0  0  1  1  2  2]
+  //   idx[2] = [0 2 1 3 0 2 1  3  0  2  1  3]
+  //   data   = [1 2 3 4 5 6 11 12 13 14 15 16]
+
+  std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                                                   0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2,
+                                                   0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3};
+  const int sizeof_index_value = sizeof(c_index_value_type);
+  auto si = this->MakeSparseCOOIndex(
+      {12, 3}, {sizeof_index_value, sizeof_index_value * 12}, coords_values);
 
-  CheckSparseTensorRoundTrip(st);
+  std::vector<int64_t> shape = {2, 3, 4};
+  std::vector<std::string> dim_names = {"foo", "bar", "baz"};
+  std::vector<int64_t> values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
+  auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names);
+
+  this->CheckSparseTensorRoundTrip(*st);
 }
 
-TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
+TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
+  using IndexValueType = TypeParam;
+
   std::string path = "test-write-sparse-csr-matrix";
   constexpr int64_t kBufferSize = 1 << 20;
-  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));
 
   std::vector<int64_t> shape = {4, 6};
   std::vector<std::string> dim_names = {"foo", "bar", "baz"};
@@ -1149,11 +1251,23 @@ TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
 
   auto data = Buffer::Wrap(values);
   NumericTensor<Int64Type> t(data, shape, {}, dim_names);
-  SparseTensorImpl<SparseCSRIndex> st(t);
+  SparseTensorImpl<SparseCSRIndex> st(t, TypeTraits<IndexValueType>::type_singleton());
 
-  CheckSparseTensorRoundTrip(st);
+  this->CheckSparseTensorRoundTrip(st);
 }
 
+REGISTER_TYPED_TEST_CASE_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor,
+                           WithSparseCOOIndexColumnMajor, WithSparseCSRIndex);
+
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseTensorRoundTrip, Int8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseTensorRoundTrip, UInt8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseTensorRoundTrip, Int16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseTensorRoundTrip, UInt16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseTensorRoundTrip, Int32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseTensorRoundTrip, UInt32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseTensorRoundTrip, Int64Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseTensorRoundTrip, UInt64Type);
+
 TEST(TestRecordBatchStreamReader, MalformedInput) {
   const std::string empty_str = "";
   const std::string garbage_str = "12345678";
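A note on the stride values exercised by the two COO tests above; they are plain
byte arithmetic on the [12, 3] coordinate matrix:

    // For IndexValueType = Int16Type (sizeof_index_value == 2), coords shape {12, 3}:
    //   row-major strides:    {sizeof_index_value * 3, sizeof_index_value}  == {6, 2}
    //   column-major strides: {sizeof_index_value, sizeof_index_value * 12} == {2, 24}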
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 6fcbd92573468..be0bedf592526 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -860,27 +860,39 @@ Status ReadTensor(const Message& message, std::shared_ptr<Tensor>* out) {
 
 namespace {
 
-Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim,
-                          int64_t non_zero_length, io::RandomAccessFile* file,
-                          std::shared_ptr<SparseIndex>* out) {
+Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor,
+                          const std::vector<int64_t>& shape, int64_t non_zero_length,
+                          io::RandomAccessFile* file, std::shared_ptr<SparseIndex>* out) {
   auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO();
+
+  std::shared_ptr<DataType> indices_type;
+  RETURN_NOT_OK(internal::GetSparseCOOIndexMetadata(sparse_index, &indices_type));
+
   auto* indices_buffer = sparse_index->indicesBuffer();
   std::shared_ptr<Buffer> indices_data;
   RETURN_NOT_OK(
       file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data));
-  std::vector<int64_t> shape({non_zero_length, ndim});
-  const int64_t elsize = sizeof(int64_t);
-  std::vector<int64_t> strides({elsize, elsize * non_zero_length});
+  std::vector<int64_t> indices_shape(
+      {non_zero_length, static_cast<int64_t>(shape.size())});
+  auto* indices_strides = sparse_index->indicesStrides();
+  std::vector<int64_t> strides;
+  // Assume indices_strides has length 2.
+  strides.push_back(indices_strides->Get(0));
+  strides.push_back(indices_strides->Get(1));
   *out = std::make_shared<SparseCOOIndex>(
-      std::make_shared<SparseCOOIndex::CoordsTensor>(indices_data, shape, strides));
+      std::make_shared<Tensor>(indices_type, indices_data, indices_shape, strides));
   return Status::OK();
 }
 
-Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim,
-                          int64_t non_zero_length, io::RandomAccessFile* file,
-                          std::shared_ptr<SparseIndex>* out) {
+Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor,
+                          const std::vector<int64_t>& shape, int64_t non_zero_length,
+                          io::RandomAccessFile* file, std::shared_ptr<SparseIndex>* out) {
   auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR();
 
+  std::shared_ptr<DataType> indptr_type, indices_type;
+  RETURN_NOT_OK(
+      internal::GetSparseCSRIndexMetadata(sparse_index, &indptr_type, &indices_type));
+
   auto* indptr_buffer = sparse_index->indptrBuffer();
   std::shared_ptr<Buffer> indptr_data;
   RETURN_NOT_OK(
@@ -891,11 +903,12 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor,
   RETURN_NOT_OK(
       file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data));
 
-  std::vector<int64_t> indptr_shape({ndim + 1});
+  std::vector<int64_t> indptr_shape({shape[0] + 1});
   std::vector<int64_t> indices_shape({non_zero_length});
+
   *out = std::make_shared<SparseCSRIndex>(
-      std::make_shared<SparseCSRIndex::IndexTensor>(indptr_data, indptr_shape),
-      std::make_shared<SparseCSRIndex::IndexTensor>(indices_data, indices_shape));
+      std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+      std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
   return Status::OK();
 }
 
@@ -952,15 +965,15 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file,
   std::shared_ptr<SparseIndex> sparse_index;
   switch (sparse_tensor_format_id) {
     case SparseTensorFormat::COO:
-      RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file,
-                                       &sparse_index));
+      RETURN_NOT_OK(
+          ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index));
       return MakeSparseTensorWithSparseCOOIndex(
          type, shape, dim_names, checked_pointer_cast<SparseCOOIndex>(sparse_index),
          non_zero_length, data, out);
     case SparseTensorFormat::CSR:
-      RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file,
-                                       &sparse_index));
+      RETURN_NOT_OK(
+          ReadSparseCSRIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index));
       return MakeSparseTensorWithSparseCSRIndex(
          type, shape, dim_names, checked_pointer_cast<SparseCSRIndex>(sparse_index),
          non_zero_length, data, out);
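A worked example for the CSR shapes read back above: ReadSparseCSRIndex now sizes
indptr from the tensor's first dimension rather than from ndim, so for a 6x4
matrix with 12 non-zero values,

    // indptr_shape  = {shape[0] + 1} = {7}   (matches the schema example
    //                                         indptr(X) = [0, 2, 3, 5, 5, 8, 10])
    // indices_shape = {non_zero_length} = {12}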
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index d90392df139fd..b6fe2f3a1e54f 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -18,11 +18,13 @@
 #include "arrow/sparse_tensor.h"
 
 #include <functional>
+#include <limits>
 #include <memory>
 #include <numeric>
 
 #include "arrow/compare.h"
 #include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
@@ -34,7 +36,8 @@ namespace {
 template <typename TYPE, typename SparseIndexType>
 class SparseTensorConverter {
  public:
-  explicit SparseTensorConverter(const NumericTensor<TYPE>&) {}
+  explicit SparseTensorConverter(const NumericTensor<TYPE>&,
+                                 const std::shared_ptr<DataType>&) {}
 
   Status Convert() { return Status::Invalid("Unsupported sparse index"); }
 };
 
@@ -47,9 +50,12 @@ struct SparseTensorConverterBase {
   using NumericTensorType = NumericTensor<TYPE>;
   using value_type = typename NumericTensorType::value_type;
 
-  explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {}
+  explicit SparseTensorConverterBase(const NumericTensorType& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type)
+      : tensor_(tensor), index_value_type_(index_value_type) {}
 
   const NumericTensorType& tensor_;
+  const std::shared_ptr<DataType>& index_value_type_;
 };
 
 template <typename TYPE>
@@ -60,17 +66,23 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
   using typename BaseClass::NumericTensorType;
   using typename BaseClass::value_type;
 
-  explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
+  explicit SparseTensorConverter(const NumericTensorType& tensor,
+                                 const std::shared_ptr<DataType>& index_value_type)
+      : BaseClass(tensor, index_value_type) {}
 
+  template <typename IndexValueType>
   Status Convert() {
+    using c_index_value_type = typename IndexValueType::c_type;
+    const int64_t indices_elsize = sizeof(c_index_value_type);
+
     const int64_t ndim = tensor_.ndim();
     int64_t nonzero_count = -1;
     RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
 
     std::shared_ptr<Buffer> indices_buffer;
-    RETURN_NOT_OK(
-        AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer));
-    int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
+    RETURN_NOT_OK(AllocateBuffer(indices_elsize * ndim * nonzero_count, &indices_buffer));
+    c_index_value_type* indices =
+        reinterpret_cast<c_index_value_type*>(indices_buffer->mutable_data());
 
     std::shared_ptr<Buffer> values_buffer;
     RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer));
@@ -81,7 +93,7 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
       const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
       for (int64_t i = 0; i < count; ++i, ++data) {
         if (*data != 0) {
-          *indices++ = i;
+          *indices++ = static_cast<c_index_value_type>(i);
           *values++ = *data;
         }
       }
@@ -94,9 +106,9 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
 
         if (tensor_.Value(coord) != 0) {
           *values++ = x;
-          int64_t* indp = indices;
+          c_index_value_type* indp = indices;
           for (int64_t i = 0; i < ndim; ++i) {
-            *indp = coord[i];
+            *indp = static_cast<c_index_value_type>(coord[i]);
             indp += nonzero_count;
           }
           indices++;
@@ -117,21 +129,36 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
 
     // make results
     const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
-    const int64_t indices_elsize = sizeof(int64_t);
     const std::vector<int64_t> indices_strides = {indices_elsize,
                                                   indices_elsize * nonzero_count};
-    sparse_index =
-        std::make_shared<SparseCOOIndex>(std::make_shared<SparseCOOIndex::CoordsTensor>(
-            indices_buffer, indices_shape, indices_strides));
+    sparse_index = std::make_shared<SparseCOOIndex>(std::make_shared<Tensor>(
+        index_value_type_, indices_buffer, indices_shape, indices_strides));
 
     data = values_buffer;
     return Status::OK();
   }
 
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following invalid case causes a program failure.
+      default:
+        return Status::Invalid("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
   std::shared_ptr<SparseCOOIndex> sparse_index;
   std::shared_ptr<Buffer> data;
 
  private:
+  using BaseClass::index_value_type_;
   using BaseClass::tensor_;
 };
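For readability: the generator macro above expands the non-template Convert()
dispatcher into one case per integer type; the Int8 entry, for example, becomes:

    case Int8Type::type_id:
      return Convert<Int8Type>();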
@@ -146,12 +173,21 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
   using NumericTensorType = typename BaseClass::NumericTensorType;
   using value_type = typename BaseClass::value_type;
 
-  explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
+  explicit SparseTensorConverter(const NumericTensorType& tensor,
+                                 const std::shared_ptr<DataType>& index_value_type)
+      : BaseClass(tensor, index_value_type) {}
 
+  template <typename IndexValueType>
   Status Convert() {
+    using c_index_value_type = typename IndexValueType::c_type;
+    RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits<c_index_value_type>::max()));
+    const int64_t indices_elsize = sizeof(c_index_value_type);
+
     const int64_t ndim = tensor_.ndim();
     if (ndim > 2) {
+      // LCOV_EXCL_START: The following invalid case causes a program failure.
       return Status::Invalid("Invalid tensor dimension");
+      // LCOV_EXCL_STOP
     }
 
     const int64_t nr = tensor_.shape()[0];
@@ -169,20 +205,21 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     if (ndim <= 1) {
       return Status::NotImplemented("TODO for ndim <= 1");
     } else {
-      RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer));
-      int64_t* indptr = reinterpret_cast<int64_t*>(indptr_buffer->mutable_data());
+      RETURN_NOT_OK(AllocateBuffer(indices_elsize * (nr + 1), &indptr_buffer));
+      auto* indptr = reinterpret_cast<c_index_value_type*>(indptr_buffer->mutable_data());
 
-      RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer));
-      int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
+      RETURN_NOT_OK(AllocateBuffer(indices_elsize * nonzero_count, &indices_buffer));
+      auto* indices =
+          reinterpret_cast<c_index_value_type*>(indices_buffer->mutable_data());
 
-      int64_t k = 0;
+      c_index_value_type k = 0;
       *indptr++ = 0;
       for (int64_t i = 0; i < nr; ++i) {
         for (int64_t j = 0; j < nc; ++j) {
           const value_type x = tensor_.Value({i, j});
           if (x != 0) {
             *values++ = x;
-            *indices++ = j;
+            *indices++ = static_cast<c_index_value_type>(j);
             k++;
           }
         }
@@ -191,12 +228,12 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     }
 
     std::vector<int64_t> indptr_shape({nr + 1});
-    std::shared_ptr<SparseCSRIndex::IndexTensor> indptr_tensor =
-        std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape);
+    std::shared_ptr<Tensor> indptr_tensor =
+        std::make_shared<Tensor>(index_value_type_, indptr_buffer, indptr_shape);
 
     std::vector<int64_t> indices_shape({nonzero_count});
-    std::shared_ptr<SparseCSRIndex::IndexTensor> indices_tensor =
-        std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape);
+    std::shared_ptr<Tensor> indices_tensor =
+        std::make_shared<Tensor>(index_value_type_, indices_buffer, indices_shape);
 
     sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
     data = values_buffer;
@@ -204,11 +241,42 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     return Status::OK();
   }
 
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following invalid case causes a program failure.
+      default:
+        return Status::Invalid("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
   std::shared_ptr<SparseCSRIndex> sparse_index;
   std::shared_ptr<Buffer> data;
 
  private:
+  using BaseClass::index_value_type_;
   using BaseClass::tensor_;
+
+  template <typename c_value_type>
+  inline Status CheckMaximumValue(const c_value_type type_max) const {
+    if (static_cast<int64_t>(type_max) < tensor_.shape()[1]) {
+      // LCOV_EXCL_START: The following invalid case causes a program failure.
+      return Status::Invalid("The bit width of the index value type is too small");
+      // LCOV_EXCL_STOP
+    }
+    return Status::OK();
+  }
+
+  inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); }
+
+  inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); }
 };
 
 // ----------------------------------------------------------------------
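As a concrete illustration of the CheckMaximumValue guard (shapes and variable
names hypothetical): an index type must be able to address every column, so

    // int8 can index at most 127 columns; a 4x200 matrix is rejected,
    // while int16 (maximum 32767) passes.
    arrow::SparseTensorImpl<arrow::SparseCSRIndex> bad(dense_4x200, arrow::int8());
    arrow::SparseTensorImpl<arrow::SparseCSRIndex> good(dense_4x200, arrow::int16());

Note that MakeSparseTensorFromTensor below wraps Convert() in ARROW_CHECK_OK, so
the returned Status::Invalid aborts the process — which is why these branches
carry LCOV exclusion markers.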
@@ -238,76 +306,61 @@ namespace {
 
 template <typename TYPE, typename SparseIndexType>
 void MakeSparseTensorFromTensor(const Tensor& tensor,
+                                const std::shared_ptr<DataType>& index_value_type,
                                 std::shared_ptr<SparseIndex>* sparse_index,
                                 std::shared_ptr<Buffer>* data) {
   NumericTensor<TYPE> numeric_tensor(tensor.data(), tensor.shape(), tensor.strides());
-  SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor);
+  SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor,
+                                                         index_value_type);
   ARROW_CHECK_OK(converter.Convert());
   *sparse_index = converter.sparse_index;
   *data = converter.data;
 }
 
+#define MAKE_SPARSE_TENSOR_FROM_TENSOR(TYPE_CLASS)                 \
+  case TYPE_CLASS##Type::type_id:                                  \
+    MakeSparseTensorFromTensor<TYPE_CLASS##Type, SparseIndexType>( \
+        tensor, index_value_type, sparse_index, data);             \
+    break;
+
 template <typename SparseIndexType>
 inline void MakeSparseTensorFromTensor(const Tensor& tensor,
+                                       const std::shared_ptr<DataType>& index_value_type,
                                        std::shared_ptr<SparseIndex>* sparse_index,
                                        std::shared_ptr<Buffer>* data) {
   switch (tensor.type()->id()) {
-    case Type::UINT8:
-      MakeSparseTensorFromTensor<UInt8Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT8:
-      MakeSparseTensorFromTensor<Int8Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT16:
-      MakeSparseTensorFromTensor<UInt16Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT16:
-      MakeSparseTensorFromTensor<Int16Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT32:
-      MakeSparseTensorFromTensor<UInt32Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT32:
-      MakeSparseTensorFromTensor<Int32Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT64:
-      MakeSparseTensorFromTensor<UInt64Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT64:
-      MakeSparseTensorFromTensor<Int64Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::HALF_FLOAT:
-      MakeSparseTensorFromTensor<HalfFloatType, SparseIndexType>(tensor, sparse_index,
-                                                                 data);
-      break;
-    case Type::FLOAT:
-      MakeSparseTensorFromTensor<FloatType, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::DOUBLE:
-      MakeSparseTensorFromTensor<DoubleType, SparseIndexType>(tensor, sparse_index, data);
-      break;
+    ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_SPARSE_TENSOR_FROM_TENSOR);
+      // LCOV_EXCL_START: ignore program failure
     default:
       ARROW_LOG(FATAL) << "Unsupported Tensor value type";
      break;
+      // LCOV_EXCL_STOP
   }
 }
 
+#undef MAKE_SPARSE_TENSOR_FROM_TENSOR
+
 }  // namespace
 
 void MakeSparseTensorFromTensor(const Tensor& tensor,
                                 SparseTensorFormat::type sparse_format_id,
+                                const std::shared_ptr<DataType>& index_value_type,
                                 std::shared_ptr<SparseIndex>* sparse_index,
                                 std::shared_ptr<Buffer>* data) {
   switch (sparse_format_id) {
     case SparseTensorFormat::COO:
-      MakeSparseTensorFromTensor<SparseCOOIndex>(tensor, sparse_index, data);
+      MakeSparseTensorFromTensor<SparseCOOIndex>(tensor, index_value_type, sparse_index,
+                                                 data);
      break;
     case SparseTensorFormat::CSR:
-      MakeSparseTensorFromTensor<SparseCSRIndex>(tensor, sparse_index, data);
+      MakeSparseTensorFromTensor<SparseCSRIndex>(tensor, index_value_type, sparse_index,
+                                                 data);
      break;
+      // LCOV_EXCL_START: ignore program failure
     default:
       ARROW_LOG(FATAL) << "Invalid sparse tensor format ID";
      break;
+      // LCOV_EXCL_STOP
   }
 }
 
@@ -316,10 +369,12 @@ void MakeSparseTensorFromTensor(const Tensor& tensor,
 // ----------------------------------------------------------------------
 // SparseCOOIndex
 
-// Constructor with a column-major NumericTensor
-SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<CoordsTensor>& coords)
+// Constructor with a contiguous NumericTensor
+SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<Tensor>& coords)
     : SparseIndexBase(coords->shape()[0]), coords_(coords) {
-  ARROW_CHECK(coords_->is_column_major());
+  ARROW_CHECK(is_integer(coords_->type_id()));
+  ARROW_CHECK(coords_->is_contiguous());
+  ARROW_CHECK_EQ(2, coords_->ndim());
 }
 
 std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); }
@@ -328,10 +383,12 @@ std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); }
 // SparseCSRIndex
 
 // Constructor with two index vectors
-SparseCSRIndex::SparseCSRIndex(const std::shared_ptr<IndexTensor>& indptr,
-                               const std::shared_ptr<IndexTensor>& indices)
+SparseCSRIndex::SparseCSRIndex(const std::shared_ptr<Tensor>& indptr,
+                               const std::shared_ptr<Tensor>& indices)
     : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) {
+  ARROW_CHECK(is_integer(indptr_->type_id()));
   ARROW_CHECK_EQ(1, indptr_->ndim());
+  ARROW_CHECK(is_integer(indices_->type_id()));
   ARROW_CHECK_EQ(1, indices_->ndim());
 }
 
diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h
index b6fe4b205978e..2b31b4763346a 100644
--- a/cpp/src/arrow/sparse_tensor.h
+++ b/cpp/src/arrow/sparse_tensor.h
@@ -81,15 +81,13 @@ class SparseIndexBase : public SparseIndex {
 /// coordinates.
 class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase<SparseCOOIndex> {
  public:
-  using CoordsTensor = NumericTensor<Int64Type>;
-
   static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO;
 
   // Constructor with a column-major NumericTensor
-  explicit SparseCOOIndex(const std::shared_ptr<CoordsTensor>& coords);
+  explicit SparseCOOIndex(const std::shared_ptr<Tensor>& coords);
 
   /// \brief Return a tensor that has the coordinates of the non-zero values
-  const std::shared_ptr<CoordsTensor>& indices() const { return coords_; }
+  const std::shared_ptr<Tensor>& indices() const { return coords_; }
 
   /// \brief Return a string representation of the sparse index
   std::string ToString() const override;
@@ -100,7 +98,7 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase<SparseCOOIndex>
 
  protected:
-  std::shared_ptr<CoordsTensor> coords_;
+  std::shared_ptr<Tensor> coords_;
 };
 
 // ----------------------------------------------------------------------
@@ -120,19 +118,17 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase<SparseCSRIndex> {
  public:
-  using IndexTensor = NumericTensor<Int64Type>;
-
   static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR;
 
   // Constructor with two index vectors
-  explicit SparseCSRIndex(const std::shared_ptr<IndexTensor>& indptr,
-                          const std::shared_ptr<IndexTensor>& indices);
+  explicit SparseCSRIndex(const std::shared_ptr<Tensor>& indptr,
+                          const std::shared_ptr<Tensor>& indices);
 
   /// \brief Return a 1D tensor of indptr vector
-  const std::shared_ptr<IndexTensor>& indptr() const { return indptr_; }
+  const std::shared_ptr<Tensor>& indptr() const { return indptr_; }
 
   /// \brief Return a 1D tensor of indices vector
-  const std::shared_ptr<IndexTensor>& indices() const { return indices_; }
+  const std::shared_ptr<Tensor>& indices() const { return indices_; }
 
   /// \brief Return a string representation of the sparse index
   std::string ToString() const override;
@@ -143,8 +139,8 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase<SparseCSRIndex>
 
  protected:
-  std::shared_ptr<IndexTensor> indptr_;
-  std::shared_ptr<IndexTensor> indices_;
+  std::shared_ptr<Tensor> indptr_;
+  std::shared_ptr<Tensor> indices_;
 };
 
 // ----------------------------------------------------------------------
@@ -222,6 +218,7 @@ namespace internal {
 ARROW_EXPORT
 void MakeSparseTensorFromTensor(const Tensor& tensor,
                                 SparseTensorFormat::type sparse_format_id,
+                                const std::shared_ptr<DataType>& index_value_type,
                                 std::shared_ptr<SparseIndex>* sparse_index,
                                 std::shared_ptr<Buffer>* data);
 
@@ -248,13 +245,16 @@ class SparseTensorImpl : public SparseTensor {
       : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {}
 
   // Constructor with a dense tensor
-  explicit SparseTensorImpl(const Tensor& tensor)
+  SparseTensorImpl(const Tensor& tensor,
+                   const std::shared_ptr<DataType>& index_value_type)
       : SparseTensorImpl(NULLPTR, tensor.type(), NULLPTR, tensor.shape(),
                          tensor.dim_names_) {
     internal::MakeSparseTensorFromTensor(tensor, SparseIndexType::format_id,
-                                         &sparse_index_, &data_);
+                                         index_value_type, &sparse_index_, &data_);
   }
 
+  explicit SparseTensorImpl(const Tensor& tensor) : SparseTensorImpl(tensor, int64()) {}
+
  private:
   ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl);
 };
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index 69ec4ca5c6052..a37f59c321170 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -39,21 +39,53 @@ static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected,
   ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id());
 }
 
-static inline void AssertCOOIndex(
-    const std::shared_ptr<SparseCOOIndex::CoordsTensor>& sidx, const int64_t nth,
-    const std::vector<int64_t>& expected_values) {
+static inline void AssertCOOIndex(const std::shared_ptr<Tensor>& sidx, const int64_t nth,
+                                  const std::vector<int64_t>& expected_values) {
   int64_t n = static_cast<int64_t>(expected_values.size());
   for (int64_t i = 0; i < n; ++i) {
-    ASSERT_EQ(expected_values[i], sidx->Value({nth, i}));
+    ASSERT_EQ(expected_values[i], sidx->Value<Int64Type>({nth, i}));
   }
 }
 
+template <typename IndexValueType>
+class TestSparseCOOTensorBase : public ::testing::Test {
+ public:
+  void SetUp() {
+    shape_ = {2, 3, 4};
+    dim_names_ = {"foo", "bar", "baz"};
+
+    // Dense representation:
+    // [
+    //   [
+    //     1 0 2 0
+    //     0 3 0 4
+    //     5 0 6 0
+    //   ],
+    //   [
+    //      0 11  0 12
+    //     13  0 14  0
+    //      0 15  0 16
+    //   ]
+    // ]
+    std::vector<int64_t> dense_values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                         0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+    auto dense_data = Buffer::Wrap(dense_values);
+    NumericTensor<Int64Type> dense_tensor(dense_data, shape_, {}, dim_names_);
+    sparse_tensor_from_dense_ = std::make_shared<SparseTensorCOO>(
+        dense_tensor, TypeTraits<IndexValueType>::type_singleton());
+  }
+
+ protected:
+  std::vector<int64_t> shape_;
+  std::vector<std::string> dim_names_;
+  std::shared_ptr<SparseTensorCOO> sparse_tensor_from_dense_;
+};
+
+class TestSparseCOOTensor : public TestSparseCOOTensorBase<Int64Type> {};
 
-TEST(TestSparseCOOTensor, CreationEmptyTensor) {
-  std::vector<int64_t> shape = {2, 3, 4};
-  SparseTensorImpl<SparseCOOIndex> st1(int64(), shape);
+TEST_F(TestSparseCOOTensor, CreationEmptyTensor) {
+  SparseTensorImpl<SparseCOOIndex> st1(int64(), this->shape_);
+  SparseTensorImpl<SparseCOOIndex> st2(int64(), this->shape_, this->dim_names_);
 
-  std::vector<std::string> dim_names = {"foo", "bar", "baz"};
-  SparseTensorImpl<SparseCOOIndex> st2(int64(), shape, dim_names);
   ASSERT_EQ(0, st1.non_zero_length());
   ASSERT_EQ(0, st2.non_zero_length());
@@ -72,39 +104,20 @@ TEST(TestSparseCOOTensor, CreationEmptyTensor) {
   ASSERT_EQ("", st1.dim_name(2));
 }
 
-TEST(TestSparseCOOTensor, CreationFromNumericTensor) {
-  std::vector<int64_t> shape = {2, 3, 4};
-  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
-                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
-  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
std::vector dim_names = {"foo", "bar", "baz"}; - NumericTensor tensor1(buffer, shape); - NumericTensor tensor2(buffer, shape, {}, dim_names); - SparseTensorImpl st1(tensor1); - SparseTensorImpl st2(tensor2); - - CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); - - ASSERT_EQ(12, st1.non_zero_length()); - ASSERT_TRUE(st1.is_mutable()); +TEST_F(TestSparseCOOTensor, CreationFromNumericTensor) { + auto& st = *this->sparse_tensor_from_dense_; + CheckSparseIndexFormatType(SparseTensorFormat::COO, st); - ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st2.dim_names()); - ASSERT_EQ("foo", st2.dim_name(0)); - ASSERT_EQ("bar", st2.dim_name(1)); - ASSERT_EQ("baz", st2.dim_name(2)); - - ASSERT_EQ(std::vector({}), st1.dim_names()); - ASSERT_EQ("", st1.dim_name(0)); - ASSERT_EQ("", st1.dim_name(1)); - ASSERT_EQ("", st1.dim_name(2)); + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); - const int64_t* raw_data = reinterpret_cast(st1.raw_data()); + auto* raw_data = reinterpret_cast(st.raw_data()); AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); - const auto& si = internal::checked_cast(*st1.sparse_index()); + const auto& si = internal::checked_cast(*st.sparse_index()); ASSERT_EQ(std::string("SparseCOOIndex"), si.ToString()); - std::shared_ptr sidx = si.indices(); + std::shared_ptr sidx = si.indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -115,113 +128,264 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { AssertCOOIndex(sidx, 11, {1, 2, 3}); } -TEST(TestSparseCOOTensor, CreationFromTensor) { - std::vector shape = {2, 3, 4}; - std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, - 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer = Buffer::Wrap(values); - std::vector dim_names = {"foo", "bar", "baz"}; - Tensor tensor1(int64(), buffer, shape); - Tensor tensor2(int64(), buffer, shape, {}, dim_names); - SparseTensorImpl st1(tensor1); - SparseTensorImpl st2(tensor2); +TEST_F(TestSparseCOOTensor, CreationFromNumericTensor1D) { + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + std::vector dense_shape({static_cast(dense_values.size())}); + NumericTensor dense_vector(dense_data, dense_shape); + SparseTensorImpl st(dense_vector); - ASSERT_EQ(12, st1.non_zero_length()); - ASSERT_TRUE(st1.is_mutable()); + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); - ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st2.dim_names()); - ASSERT_EQ("foo", st2.dim_name(0)); - ASSERT_EQ("bar", st2.dim_name(1)); - ASSERT_EQ("baz", st2.dim_name(2)); + auto* raw_data = reinterpret_cast(st.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); - ASSERT_EQ(std::vector({}), st1.dim_names()); - ASSERT_EQ("", st1.dim_name(0)); - ASSERT_EQ("", st1.dim_name(1)); - ASSERT_EQ("", st1.dim_name(2)); + const auto& si = internal::checked_cast(*st.sparse_index()); + auto sidx = si.indices(); + ASSERT_EQ(std::vector({12, 1}), sidx->shape()); + + AssertCOOIndex(sidx, 0, {0}); + AssertCOOIndex(sidx, 1, {2}); + AssertCOOIndex(sidx, 2, {5}); + AssertCOOIndex(sidx, 10, {21}); + AssertCOOIndex(sidx, 11, {23}); +} - const int64_t* raw_data = reinterpret_cast(st1.raw_data()); - AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); +TEST_F(TestSparseCOOTensor, CreationFromTensor) { + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 
 
-TEST(TestSparseCOOTensor, CreationFromTensor) {
-  std::vector<int64_t> shape = {2, 3, 4};
-  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
-                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
-  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
-  std::vector<std::string> dim_names = {"foo", "bar", "baz"};
-  Tensor tensor1(int64(), buffer, shape);
-  Tensor tensor2(int64(), buffer, shape, {}, dim_names);
-  SparseTensorImpl<SparseCOOIndex> st1(tensor1);
-  SparseTensorImpl<SparseCOOIndex> st2(tensor2);
+TEST_F(TestSparseCOOTensor, CreationFromTensor) {
+  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+  Tensor tensor(int64(), buffer, this->shape_, {}, this->dim_names_);
+  SparseTensorImpl<SparseCOOIndex> st(tensor);
 
-  ASSERT_EQ(12, st1.non_zero_length());
-  ASSERT_TRUE(st1.is_mutable());
+  ASSERT_EQ(12, st.non_zero_length());
+  ASSERT_TRUE(st.is_mutable());
 
-  ASSERT_EQ(std::vector<std::string>({"foo", "bar", "baz"}), st2.dim_names());
-  ASSERT_EQ("foo", st2.dim_name(0));
-  ASSERT_EQ("bar", st2.dim_name(1));
-  ASSERT_EQ("baz", st2.dim_name(2));
+  ASSERT_EQ(std::vector<std::string>({"foo", "bar", "baz"}), st.dim_names());
+  ASSERT_EQ("foo", st.dim_name(0));
+  ASSERT_EQ("bar", st.dim_name(1));
+  ASSERT_EQ("baz", st.dim_name(2));
 
-  ASSERT_EQ(std::vector<std::string>({}), st1.dim_names());
-  ASSERT_EQ("", st1.dim_name(0));
-  ASSERT_EQ("", st1.dim_name(1));
-  ASSERT_EQ("", st1.dim_name(2));
-
-  const int64_t* raw_data = reinterpret_cast<const int64_t*>(st1.raw_data());
-  AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16});
-
-  const auto& si = internal::checked_cast<const SparseCOOIndex&>(*st1.sparse_index());
-  std::shared_ptr<SparseCOOIndex::CoordsTensor> sidx = si.indices();
-  ASSERT_EQ(std::vector<int64_t>({12, 3}), sidx->shape());
-  ASSERT_TRUE(sidx->is_column_major());
-
-  AssertCOOIndex(sidx, 0, {0, 0, 0});
-  AssertCOOIndex(sidx, 1, {0, 0, 2});
-  AssertCOOIndex(sidx, 2, {0, 1, 1});
-  AssertCOOIndex(sidx, 10, {1, 2, 1});
-  AssertCOOIndex(sidx, 11, {1, 2, 3});
+  ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_));
 }
 
-TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) {
-  std::vector<int64_t> shape = {2, 3, 4};
+TEST_F(TestSparseCOOTensor, CreationFromNonContiguousTensor) {
   std::vector<int64_t> values = {1,  0, 0, 0, 2,  0, 0, 0, 0, 0, 3,  0, 0, 0, 4,  0,
                                  5,  0, 0, 0, 6,  0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0,
                                  13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0};
   std::vector<int64_t> strides = {192, 64, 16};
   std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
-  Tensor tensor(int64(), buffer, shape, strides);
+  Tensor tensor(int64(), buffer, this->shape_, strides);
   SparseTensorImpl<SparseCOOIndex> st(tensor);
 
   ASSERT_EQ(12, st.non_zero_length());
   ASSERT_TRUE(st.is_mutable());
 
-  const int64_t* raw_data = reinterpret_cast<const int64_t*>(st.raw_data());
-  AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16});
-
-  const auto& si = internal::checked_cast<const SparseCOOIndex&>(*st.sparse_index());
-  std::shared_ptr<SparseCOOIndex::CoordsTensor> sidx = si.indices();
-  ASSERT_EQ(std::vector<int64_t>({12, 3}), sidx->shape());
-  ASSERT_TRUE(sidx->is_column_major());
-
-  AssertCOOIndex(sidx, 0, {0, 0, 0});
-  AssertCOOIndex(sidx, 1, {0, 0, 2});
-  AssertCOOIndex(sidx, 2, {0, 1, 1});
-  AssertCOOIndex(sidx, 10, {1, 2, 1});
-  AssertCOOIndex(sidx, 11, {1, 2, 3});
+  ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_));
 }
 
-TEST(TestSparseCOOTensor, TensorEquality) {
-  std::vector<int64_t> shape = {2, 3, 4};
+TEST_F(TestSparseCOOTensor, TensorEquality) {
   std::vector<int64_t> values1 = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
                                   0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
   std::vector<int64_t> values2 = {0, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
                                   0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
   std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
   std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
-  NumericTensor<Int64Type> tensor1(buffer1, shape);
-  NumericTensor<Int64Type> tensor2(buffer1, shape);
-  NumericTensor<Int64Type> tensor3(buffer2, shape);
+  NumericTensor<Int64Type> tensor1(buffer1, this->shape_);
+  NumericTensor<Int64Type> tensor2(buffer2, this->shape_);
   SparseTensorImpl<SparseCOOIndex> st1(tensor1);
   SparseTensorImpl<SparseCOOIndex> st2(tensor2);
-  SparseTensorImpl<SparseCOOIndex> st3(tensor3);
 
-  ASSERT_TRUE(st1.Equals(st2));
-  ASSERT_TRUE(!st1.Equals(st3));
+  ASSERT_TRUE(st1.Equals(*this->sparse_tensor_from_dense_));
+  ASSERT_FALSE(st1.Equals(st2));
 }
 
+template <typename IndexValueType>
+class TestSparseCOOTensorForIndexValueType
+    : public TestSparseCOOTensorBase<IndexValueType> {
+ protected:
+  std::shared_ptr<SparseCOOIndex> MakeSparseCOOIndex(
+      const std::vector<int64_t>& coords_shape,
+      const std::vector<int64_t>& coords_strides,
+      std::vector<typename IndexValueType::c_type>& coords_values) const {
+    auto coords_data = Buffer::Wrap(coords_values);
+    auto coords = std::make_shared<NumericTensor<IndexValueType>>(
+        coords_data, coords_shape, coords_strides);
+    return std::make_shared<SparseCOOIndex>(coords);
+  }
+
+  template <typename ValueType>
+  std::shared_ptr<SparseTensorCOO> MakeSparseTensor(
+      const std::shared_ptr<SparseCOOIndex>& si,
+      std::vector<ValueType>& sparse_values) const {
+    auto data = Buffer::Wrap(sparse_values);
+    return std::make_shared<SparseTensorCOO>(si,
+                                             CTypeTraits<ValueType>::type_singleton(),
+                                             data, this->shape_, this->dim_names_);
+  }
+};
+
+TYPED_TEST_CASE_P(TestSparseCOOTensorForIndexValueType);
+
+TYPED_TEST_P(TestSparseCOOTensorForIndexValueType, CreationWithRowMajorIndex) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  // Sparse representation:
+  //   idx[0] = [0 0 0 0 0 0 1  1  1  1  1  1]
+  //   idx[1] = [0 0 1 1 2 2 0  0  1  1  2  2]
+  //   idx[2] = [0 2 1 3 0 2 1  3  0  2  1  3]
+  //   data   = [1 2 3 4 5 6 11 12 13 14 15 16]
+  std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3,
+                                                   0, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 3,
+                                                   1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3};
+  const int sizeof_index_value = sizeof(c_index_value_type);
+  auto si = this->MakeSparseCOOIndex(
+      {12, 3}, {sizeof_index_value * 3, sizeof_index_value}, coords_values);
+
+  std::vector<int64_t> sparse_values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
+  auto st = this->MakeSparseTensor(si, sparse_values);
+
+  ASSERT_EQ(std::vector<std::string>({"foo", "bar", "baz"}), st->dim_names());
+  ASSERT_EQ("foo", st->dim_name(0));
+  ASSERT_EQ("bar", st->dim_name(1));
+  ASSERT_EQ("baz", st->dim_name(2));
+
+  ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_));
+}
+
+TYPED_TEST_P(TestSparseCOOTensorForIndexValueType, CreationWithColumnMajorIndex) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  // Sparse representation:
+  //   idx[0] = [0 0 0 0 0 0 1  1  1  1  1  1]
+  //   idx[1] = [0 0 1 1 2 2 0  0  1  1  2  2]
+  //   idx[2] = [0 2 1 3 0 2 1  3  0  2  1  3]
+  //   data   = [1 2 3 4 5 6 11 12 13 14 15 16]
+  std::vector<c_index_value_type> coords_values = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                                                   0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2,
+                                                   0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3};
+  const int sizeof_index_value = sizeof(c_index_value_type);
+  auto si = this->MakeSparseCOOIndex(
+      {12, 3}, {sizeof_index_value, sizeof_index_value * 12}, coords_values);
+
+  std::vector<int64_t> sparse_values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
+  auto st = this->MakeSparseTensor(si, sparse_values);
+
+  ASSERT_EQ(std::vector<std::string>({"foo", "bar", "baz"}), st->dim_names());
+  ASSERT_EQ("foo", st->dim_name(0));
+  ASSERT_EQ("bar", st->dim_name(1));
+  ASSERT_EQ("baz", st->dim_name(2));
+
+  ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_));
+}
+
+TYPED_TEST_P(TestSparseCOOTensorForIndexValueType,
+             EqualityBetweenRowAndColumnMajorIndices) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  // Sparse representation:
+  //   idx[0] = [0 0 0 0 0 0 1  1  1  1  1  1]
+  //   idx[1] = [0 0 1 1 2 2 0  0  1  1  2  2]
+  //   idx[2] = [0 2 1 3 0 2 1  3  0  2  1  3]
+  //   data   = [1 2 3 4 5 6 11 12 13 14 15 16]
+
+  // Row-major COO index
+  const std::vector<int64_t> coords_shape = {12, 3};
+  const int sizeof_index_value = sizeof(c_index_value_type);
+  std::vector<c_index_value_type> coords_values_row_major = {
+      0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3, 0, 2, 0, 0, 2, 2,
+      1, 0, 1, 1, 0, 3, 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3};
+  auto si_row_major =
+      this->MakeSparseCOOIndex(coords_shape, {sizeof_index_value * 3, sizeof_index_value},
+                               coords_values_row_major);
+
+  // Column-major COO index
+  std::vector<c_index_value_type> coords_values_col_major = {
+      0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2,
+      0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3};
+  auto si_col_major = this->MakeSparseCOOIndex(
+      coords_shape, {sizeof_index_value, sizeof_index_value * 12},
+      coords_values_col_major);
+
+  std::vector<int64_t> sparse_values_1 = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
+  auto st1 = this->MakeSparseTensor(si_row_major, sparse_values_1);
+
+  std::vector<int64_t> sparse_values_2 = sparse_values_1;
+  auto st2 = this->MakeSparseTensor(si_col_major, sparse_values_2);
+
+  ASSERT_TRUE(st2->Equals(*st1));
+}
+
+REGISTER_TYPED_TEST_CASE_P(TestSparseCOOTensorForIndexValueType,
+                           CreationWithRowMajorIndex, CreationWithColumnMajorIndex,
+                           EqualityBetweenRowAndColumnMajorIndices);
+
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCOOTensorForIndexValueType, Int8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCOOTensorForIndexValueType, UInt8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCOOTensorForIndexValueType, Int16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCOOTensorForIndexValueType,
+                              UInt16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCOOTensorForIndexValueType, Int32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCOOTensorForIndexValueType,
+                              UInt32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCOOTensorForIndexValueType, Int64Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCOOTensorForIndexValueType,
+                              UInt64Type);
+
+template <typename IndexValueType>
+class TestSparseCSRMatrixBase : public ::testing::Test {
+ public:
+  void SetUp() {
+    shape_ = {6, 4};
+    dim_names_ = {"foo", "bar"};
+
+    // Dense representation:
+    // [
+    //    1  0  2  0
+    //    0  3  0  4
+    //    5  0  6  0
+    //    0 11  0 12
+    //   13  0 14  0
+    //    0 15  0 16
+    // ]
+    std::vector<int64_t> dense_values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                         0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+    auto dense_data = Buffer::Wrap(dense_values);
+    NumericTensor<Int64Type> dense_tensor(dense_data, shape_, {}, dim_names_);
+    sparse_tensor_from_dense_ = std::make_shared<SparseTensorCSR>(
+        dense_tensor, TypeTraits<IndexValueType>::type_singleton());
+  }
+
+ protected:
+  std::vector<int64_t> shape_;
+  std::vector<std::string> dim_names_;
+  std::shared_ptr<SparseTensorCSR> sparse_tensor_from_dense_;
+};
+
+class TestSparseCSRMatrix : public TestSparseCSRMatrixBase<Int64Type> {};
+
-TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) {
-  std::vector<int64_t> shape = {6, 4};
+TEST_F(TestSparseCSRMatrix, CreationFromNumericTensor2D) {
   std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
                                  0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
   std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
-  std::vector<std::string> dim_names = {"foo", "bar", "baz"};
-  NumericTensor<Int64Type> tensor1(buffer, shape);
-  NumericTensor<Int64Type> tensor2(buffer, shape, {}, dim_names);
+  NumericTensor<Int64Type> tensor(buffer, this->shape_);
 
-  SparseTensorImpl<SparseCSRIndex> st1(tensor1);
-  SparseTensorImpl<SparseCSRIndex> st2(tensor2);
+  SparseTensorImpl<SparseCSRIndex> st1(tensor);
+  SparseTensorImpl<SparseCSRIndex>& st2 = *this->sparse_tensor_from_dense_;
 
   CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1);
 
   ASSERT_EQ(12, st1.non_zero_length());
   ASSERT_TRUE(st1.is_mutable());
 
-  ASSERT_EQ(std::vector<std::string>({"foo", "bar", "baz"}), st2.dim_names());
+  ASSERT_EQ(std::vector<std::string>({"foo", "bar"}), st2.dim_names());
   ASSERT_EQ("foo", st2.dim_name(0));
   ASSERT_EQ("bar", st2.dim_name(1));
-  ASSERT_EQ("baz", st2.dim_name(2));
 
   ASSERT_EQ(std::vector<std::string>({}), st1.dim_names());
   ASSERT_EQ("", st1.dim_name(0));
@@ -252,14 +416,13 @@ TEST_F(TestSparseCSRMatrix, CreationFromNumericTensor2D) {
   ASSERT_EQ(std::vector<int64_t>({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values);
 }
 
-TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) {
-  std::vector<int64_t> shape = {6, 4};
+TEST_F(TestSparseCSRMatrix, CreationFromNonContiguousTensor) {
   std::vector<int64_t> values = {1,  0, 0, 0, 2,  0, 0, 0, 0, 0, 3,  0, 0, 0, 4,  0,
                                  5,  0, 0, 0, 6,  0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0,
                                  13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0};
   std::vector<int64_t> strides = {64, 16};
   std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
-  Tensor tensor(int64(), buffer, shape, strides);
+  Tensor tensor(int64(), buffer, this->shape_, strides);
   SparseTensorImpl<SparseCSRIndex> st(tensor);
 
   ASSERT_EQ(12, st.non_zero_length());
   ASSERT_TRUE(st.is_mutable());
@@ -286,26 +449,24 @@ TEST_F(TestSparseCSRMatrix, CreationFromNonContiguousTensor) {
   ASSERT_EQ(12, indices_values.size());
 
   ASSERT_EQ(std::vector<int64_t>({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values);
+
+  ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_));
 }
 
-TEST(TestSparseCSRMatrix, TensorEquality) {
-  std::vector<int64_t> shape = {6, 4};
+TEST_F(TestSparseCSRMatrix, TensorEquality) {
   std::vector<int64_t> values1 = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
                                   0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
-  std::vector<int64_t> values2 = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
+  std::vector<int64_t> values2 = {9, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                  0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
   std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
   std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
-  NumericTensor<Int64Type> tensor1(buffer1, shape);
-  NumericTensor<Int64Type> tensor2(buffer1, shape);
-  NumericTensor<Int64Type> tensor3(buffer2, shape);
+  NumericTensor<Int64Type> tensor1(buffer1, this->shape_);
+  NumericTensor<Int64Type> tensor2(buffer2, this->shape_);
   SparseTensorImpl<SparseCSRIndex> st1(tensor1);
   SparseTensorImpl<SparseCSRIndex> st2(tensor2);
-  SparseTensorImpl<SparseCSRIndex> st3(tensor3);
 
-  ASSERT_TRUE(st1.Equals(st2));
-  ASSERT_TRUE(!st1.Equals(st3));
+  ASSERT_TRUE(st1.Equals(*this->sparse_tensor_from_dense_));
+  ASSERT_FALSE(st1.Equals(st2));
 }
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 58ef6f7a50179..95a5c6ec6e120 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -31,39 +31,45 @@
 
 namespace arrow {
 
-#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \
-  ACTION(Null);                              \
-  ACTION(Boolean);                           \
-  ACTION(Int8);                              \
-  ACTION(UInt8);                             \
-  ACTION(Int16);                             \
-  ACTION(UInt16);                            \
-  ACTION(Int32);                             \
-  ACTION(UInt32);                            \
-  ACTION(Int64);                             \
-  ACTION(UInt64);                            \
-  ACTION(HalfFloat);                         \
-  ACTION(Float);                             \
-  ACTION(Double);                            \
-  ACTION(String);                            \
-  ACTION(Binary);                            \
-  ACTION(LargeString);                       \
-  ACTION(LargeBinary);                       \
-  ACTION(FixedSizeBinary);                   \
-  ACTION(Duration);                          \
-  ACTION(Date32);                            \
-  ACTION(Date64);                            \
-  ACTION(Timestamp);                         \
-  ACTION(Time32);                            \
-  ACTION(Time64);                            \
-  ACTION(Decimal128);                        \
-  ACTION(List);                              \
-  ACTION(LargeList);                         \
-  ACTION(Map);                               \
-  ACTION(FixedSizeList);                     \
-  ACTION(Struct);                            \
-  ACTION(Union);                             \
-  ACTION(Dictionary);                        \
+#define ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION) \
+  ACTION(Int8);                                      \
+  ACTION(UInt8);                                     \
+  ACTION(Int16);                                     \
+  ACTION(UInt16);                                    \
+  ACTION(Int32);                                     \
+  ACTION(UInt32);                                    \
+  ACTION(Int64);                                     \
+  ACTION(UInt64)
+
+#define ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION) \
+  ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION);      \
+  ACTION(HalfFloat);                                 \
+  ACTION(Float);                                     \
+  ACTION(Double)
+
+#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION)    \
+  ACTION(Null);                                 \
+  ACTION(Boolean);                              \
+  ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
+  ACTION(String);                               \
+  ACTION(Binary);                               \
+  ACTION(LargeString);                          \
+  ACTION(LargeBinary);                          \
+  ACTION(FixedSizeBinary);                      \
+  ACTION(Duration);                             \
+  ACTION(Date32);                               \
+  ACTION(Date64);                               \
+  ACTION(Timestamp);                            \
+  ACTION(Time32);                               \
+  ACTION(Time64);                               \
+  ACTION(Decimal128);                           \
+  ACTION(List);                                 \
+  ACTION(LargeList);                            \
+  ACTION(Map);                                  \
+  ACTION(FixedSizeList);                        \
+  ACTION(Struct);                               \
+  ACTION(Union);                                \
+  ACTION(Dictionary);                           \
   ACTION(Extension)
 
 #define TYPE_VISIT_INLINE(TYPE_CLASS) \
diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs
index 853dd1985f930..96d954d1edf70 100644
--- a/format/SparseTensor.fbs
+++ b/format/SparseTensor.fbs
@@ -25,36 +25,50 @@ namespace org.apache.arrow.flatbuf;
 /// ----------------------------------------------------------------------
 /// EXPERIMENTAL: Data structures for sparse tensors
 
-/// Coodinate format of sparse tensor index.
+/// Coordinate (COO) format of sparse tensor index.
+///
+/// COO's index list is represented as a NxM matrix,
+/// where N is the number of non-zero values,
+/// and M is the number of dimensions of a sparse tensor.
+///
+/// indicesBuffer stores the location and size of the data of this indices
+/// matrix.  The value type and the strides of the indices matrix are
+/// specified in the indicesType and indicesStrides fields.
+///
+/// For example, let X be a 2x3x4x5 tensor, and it has the following
+/// 6 non-zero values:
+///
+///   X[0, 1, 2, 0] := 1
+///   X[1, 1, 2, 3] := 2
+///   X[0, 2, 1, 0] := 3
+///   X[0, 1, 3, 0] := 4
+///   X[0, 1, 2, 1] := 5
+///   X[1, 2, 0, 4] := 6
+///
+/// In COO format, the index matrix of X is the following 6x4 matrix:
+///
+///   [[0, 1, 2, 0],
+///    [0, 1, 2, 1],
+///    [0, 1, 3, 0],
+///    [0, 2, 1, 0],
+///    [1, 1, 2, 3],
+///    [1, 2, 0, 4]]
+///
+/// Note that the indices are sorted in lexicographical order.
 table SparseTensorIndexCOO {
-  /// COO's index list are represented as a NxM matrix,
-  /// where N is the number of non-zero values,
-  /// and M is the number of dimensions of a sparse tensor.
-  /// indicesBuffer stores the location and size of this index matrix.
-  /// The type of index value is long, so the stride for the index matrix is unnecessary.
-  ///
-  /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values:
-  ///
-  ///   X[0, 1, 2, 0] := 1
-  ///   X[1, 1, 2, 3] := 2
-  ///   X[0, 2, 1, 0] := 3
-  ///   X[0, 1, 3, 0] := 4
-  ///   X[0, 1, 2, 1] := 5
-  ///   X[1, 2, 0, 4] := 6
-  ///
-  /// In COO format, the index matrix of X is the following 4x6 matrix:
-  ///
-  ///   [[0, 0, 0, 0, 1, 1],
-  ///    [1, 1, 1, 2, 1, 2],
-  ///    [2, 2, 3, 1, 2, 0],
-  ///    [0, 1, 0, 0, 3, 4]]
-  ///
-  /// Note that the indices are sorted in lexicographical order.
+  /// The type of values in indicesBuffer
+  indicesType: Int;
+
+  /// Non-negative byte offsets to advance one value cell along each dimension
+  indicesStrides: [long];
+
+  /// The location and size of the indices matrix's data
   indicesBuffer: Buffer;
 }
 
 /// Compressed Sparse Row format, that is matrix-specific.
 table SparseMatrixIndexCSR {
+  /// The type of values in indptrBuffer
+  indptrType: Int;
+
   /// indptrBuffer stores the location and size of indptr array that
   /// represents the range of the rows.
   /// The i-th row spans from indptr[i] to indptr[i+1] in the data.
@@ -79,6 +93,9 @@ table SparseMatrixIndexCSR {
   ///   indptr(X) = [0, 2, 3, 5, 5, 8, 10].
   indptrBuffer: Buffer;
 
+  /// The type of values in indicesBuffer
+  indicesType: Int;
+
   /// indicesBuffer stores the location and size of the array that
   /// contains the column indices of the corresponding non-zero values.
   /// The type of index value is long.
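To make the new schema fields concrete, here is a sketch of the COO index
metadata for the 16-bit, row-major [12, 3] coordinate matrix used in the C++
tests above (FlatBuffers JSON-style notation; the buffer offset is hypothetical):

    indicesType:    { bitWidth: 16, is_signed: true }  // Int16 index values
    indicesStrides: [6, 2]                    // row-major: 3 * 2 bytes, then 2 bytes
    indicesBuffer:  { offset: 0, length: 72 } // 12 * 3 * 2 bytes, 8-byte aligned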