diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc
index 737cd7510a66e..6810351f982fa 100644
--- a/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/cpp/src/arrow/ipc/metadata_internal.cc
@@ -967,9 +967,23 @@ Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index,
                                 flatbuf::SparseTensorIndex* fb_sparse_index_type,
                                 Offset* fb_sparse_index, size_t* num_buffers) {
   *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO;
+
+  // We assume that the value type of the indices tensor is an integer.
+  const auto& index_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+  auto indices_type_offset =
+      flatbuf::CreateInt(fbb, index_value_type.bit_width(), index_value_type.is_signed());
+
+  auto fb_strides =
+      fbb.CreateVector(util::MakeNonNull(sparse_index.indices()->strides().data()),
+                       sparse_index.indices()->strides().size());
+
   const BufferMetadata& indices_metadata = buffers[0];
   flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
-  *fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union();
+
+  *fb_sparse_index =
+      flatbuf::CreateSparseTensorIndexCOO(fbb, indices_type_offset, fb_strides, &indices)
+          .Union();
   *num_buffers = 1;
   return Status::OK();
 }
@@ -979,11 +993,28 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index,
                                 flatbuf::SparseTensorIndex* fb_sparse_index_type,
                                 Offset* fb_sparse_index, size_t* num_buffers) {
   *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR;
+
+  // We assume that the value type of the indptr tensor is an integer.
+  const auto& indptr_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indptr()->type());
+  auto indptr_type_offset = flatbuf::CreateInt(fbb, indptr_value_type.bit_width(),
+                                               indptr_value_type.is_signed());
+
   const BufferMetadata& indptr_metadata = buffers[0];
-  const BufferMetadata& indices_metadata = buffers[1];
   flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length);
+
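Aside on the hunks above, before the CSR writer continues: the index value type is serialized as a Flatbuffers `Int` table, i.e. a (bit_width, is_signed) pair built by `flatbuf::CreateInt`, and the read path's `IntFromFlatbuffer` must map that pair back to an Arrow `DataType`. A minimal sketch of that mapping, assuming only the four standard widths are valid; the helper name `MakeIntegerType` is hypothetical:

```cpp
#include <memory>

#include "arrow/type.h"

// Hypothetical helper: invert (bit_width, is_signed) into an Arrow integer type.
std::shared_ptr<arrow::DataType> MakeIntegerType(int bit_width, bool is_signed) {
  switch (bit_width) {
    case 8:
      return is_signed ? arrow::int8() : arrow::uint8();
    case 16:
      return is_signed ? arrow::int16() : arrow::uint16();
    case 32:
      return is_signed ? arrow::int32() : arrow::uint32();
    case 64:
      return is_signed ? arrow::int64() : arrow::uint64();
    default:
      return nullptr;  // not a width the sparse tensor metadata can describe
  }
}
```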
+  // We assume that the value type of the indices tensor is an integer.
+  const auto& indices_value_type =
+      checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+  auto indices_type_offset = flatbuf::CreateInt(fbb, indices_value_type.bit_width(),
+                                                indices_value_type.is_signed());
+
+  const BufferMetadata& indices_metadata = buffers[1];
   flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
-  *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union();
+
+  *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, indptr_type_offset, &indptr,
+                                                         indices_type_offset, &indices)
+                         .Union();
   *num_buffers = 2;
   return Status::OK();
 }
@@ -1189,6 +1220,19 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
   return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
 }
 
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+                                 std::shared_ptr<DataType>* indices_type) {
+  return IntFromFlatbuffer(sparse_index->indicesType(), indices_type);
+}
+
+Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
+                                 std::shared_ptr<DataType>* indptr_type,
+                                 std::shared_ptr<DataType>* indices_type) {
+  RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indptrType(), indptr_type));
+  RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indicesType(), indices_type));
+  return Status::OK();
+}
+
 Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
                                std::vector<int64_t>* shape,
                                std::vector<std::string>* dim_names,
diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h
index 94adf640ebdbe..828affd13f242 100644
--- a/cpp/src/arrow/ipc/metadata_internal.h
+++ b/cpp/src/arrow/ipc/metadata_internal.h
@@ -103,6 +103,15 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
                          std::vector<int64_t>* shape, std::vector<int64_t>* strides,
                          std::vector<std::string>* dim_names);
 
+// EXPERIMENTAL: Extracting metadata of a SparseCOOIndex from the message
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+                                 std::shared_ptr<DataType>* indices_type);
+
+// EXPERIMENTAL: Extracting metadata of a SparseCSRIndex from the message
+Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index,
+                                 std::shared_ptr<DataType>* indptr_type,
+                                 std::shared_ptr<DataType>* indices_type);
+
 // EXPERIMENTAL: Extracting metadata of a sparse tensor from the message
 Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
                                std::vector<int64_t>* shape,
diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc
index 2f1ac5e980157..9cbeacfe9836b 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -1043,34 +1043,58 @@ TEST_F(TestTensorRoundTrip, NonContiguous) {
   CheckTensorRoundTrip(tensor);
 }
 
+template <typename IndexValueType>
 class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture {
  public:
   void SetUp() { IpcTestFixture::SetUp(); }
   void TearDown() { IpcTestFixture::TearDown(); }
 
-  template <typename SparseIndexType>
-  void CheckSparseTensorRoundTrip(const SparseTensorImpl<SparseIndexType>& tensor) {
-    GTEST_FAIL();
+  void CheckSparseTensorRoundTrip(const SparseTensorCOO& sparse_tensor);
+  void CheckSparseTensorRoundTrip(const SparseTensorCSR& sparse_tensor);
+
+ protected:
+  std::shared_ptr<SparseCOOIndex> MakeSparseCOOIndex(
+      const std::vector<int64_t>& coords_shape,
+      const std::vector<int64_t>& coords_strides,
+      std::vector<typename IndexValueType::c_type>& coords_values) const {
+    auto coords_data = Buffer::Wrap(coords_values);
+    auto coords = std::make_shared<NumericTensor<IndexValueType>>(
+        coords_data, coords_shape, coords_strides);
+    return std::make_shared<SparseCOOIndex>(coords);
+  }
+
+  template <typename ValueType>
+  std::shared_ptr<SparseTensorCOO> MakeSparseTensorCOO(
+      const
std::shared_ptr& si, std::vector& sparse_values, + const std::vector& shape, + const std::vector& dim_names = {}) const { + auto data = Buffer::Wrap(sparse_values); + return std::make_shared(si, CTypeTraits::type_singleton(), + data, shape, dim_names); } }; -template <> -void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseTensorImpl& tensor) { - const auto& type = checked_cast(*tensor.type()); +template +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensorCOO& sparse_tensor) { + const auto& type = checked_cast(*sparse_tensor.type()); const int elem_size = type.bit_width() / 8; + const int index_elem_size = sizeof(typename IndexValueType::c_type); int32_t metadata_length; int64_t body_length; ASSERT_OK(mmap_->Seek(0)); - ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); - const auto& sparse_index = checked_cast(*tensor.sparse_index()); - const int64_t indices_length = elem_size * sparse_index.indices()->size(); - const int64_t data_length = elem_size * tensor.non_zero_length(); + const auto& sparse_index = + checked_cast(*sparse_tensor.sparse_index()); + const int64_t indices_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); + const int64_t data_length = + BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); const int64_t expected_body_length = indices_length + data_length; ASSERT_EQ(expected_body_length, body_length); @@ -1083,27 +1107,32 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( checked_cast(*result->sparse_index()); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); - ASSERT_TRUE(result->Equals(*result)); + ASSERT_TRUE(result->Equals(sparse_tensor)); } -template <> -void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseTensorImpl& tensor) { - const auto& type = checked_cast(*tensor.type()); +template +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensorCSR& sparse_tensor) { + const auto& type = checked_cast(*sparse_tensor.type()); const int elem_size = type.bit_width() / 8; + const int index_elem_size = sizeof(typename IndexValueType::c_type); int32_t metadata_length; int64_t body_length; ASSERT_OK(mmap_->Seek(0)); - ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + ASSERT_OK(WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); - const auto& sparse_index = checked_cast(*tensor.sparse_index()); - const int64_t indptr_length = elem_size * sparse_index.indptr()->size(); - const int64_t indices_length = elem_size * sparse_index.indices()->size(); - const int64_t data_length = elem_size * tensor.non_zero_length(); + const auto& sparse_index = + checked_cast(*sparse_tensor.sparse_index()); + const int64_t indptr_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size()); + const int64_t indices_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); + const int64_t data_length = + BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); const int64_t expected_body_length = indptr_length + indices_length + data_length; ASSERT_EQ(expected_body_length, body_length); @@ -1117,30 +1146,103 @@ void 
TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); - ASSERT_TRUE(result->Equals(*result)); + ASSERT_TRUE(result->Equals(sparse_tensor)); } -TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) { +TYPED_TEST_CASE_P(TestSparseTensorRoundTrip); + +TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + std::string path = "test-write-sparse-coo-tensor"; constexpr int64_t kBufferSize = 1 << 20; - ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); + ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_)); + + // Dense representation: + // [ + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // ], + // [ + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + // ] + // + // Sparse representation: + // idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1] + // idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2] + // idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3] + // data = [1 2 3 4 5 6 11 12 13 14 15 16] + + std::vector coords_values = {0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3, + 0, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 3, + 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3}; + const int sizeof_index_value = sizeof(c_index_value_type); + auto si = this->MakeSparseCOOIndex( + {12, 3}, {sizeof_index_value * 3, sizeof_index_value}, coords_values); std::vector shape = {2, 3, 4}; std::vector dim_names = {"foo", "bar", "baz"}; - std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, - 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::vector values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; + auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names); - auto data = Buffer::Wrap(values); - NumericTensor t(data, shape, {}, dim_names); - SparseTensorImpl st(t); + this->CheckSparseTensorRoundTrip(*st); +} + +TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexColumnMajor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::string path = "test-write-sparse-coo-tensor"; + constexpr int64_t kBufferSize = 1 << 20; + ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_)); + + // Dense representation: + // [ + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // ], + // [ + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + // ] + // + // Sparse representation: + // idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1] + // idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2] + // idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3] + // data = [1 2 3 4 5 6 11 12 13 14 15 16] + + std::vector coords_values = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, + 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}; + const int sizeof_index_value = sizeof(c_index_value_type); + auto si = this->MakeSparseCOOIndex( + {12, 3}, {sizeof_index_value, sizeof_index_value * 12}, coords_values); - CheckSparseTensorRoundTrip(st); + std::vector shape = {2, 3, 4}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; + auto st = this->MakeSparseTensorCOO(si, values, shape, dim_names); + + this->CheckSparseTensorRoundTrip(*st); } -TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) { +TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) { + using IndexValueType = TypeParam; + std::string path = "test-write-sparse-csr-matrix"; 
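Aside on the expected body lengths computed in the two checks above (the CSR round-trip test resumes below): each buffer written to the IPC message body is padded to an 8-byte boundary, which is why every size is wrapped in `BitUtil::RoundUpToMultipleOf8`. A self-contained sketch of that arithmetic, assuming the usual bit-twiddling definition:

```cpp
#include <cassert>
#include <cstdint>

// Round n up to the next multiple of 8; this mirrors what
// BitUtil::RoundUpToMultipleOf8 is expected to compute.
int64_t RoundUpToMultipleOf8(int64_t n) { return (n + 7) & ~static_cast<int64_t>(7); }

int main() {
  // 12 non-zero int8 values occupy 12 bytes, padded to 16 in the body.
  assert(RoundUpToMultipleOf8(12) == 16);
  // A 12x3 matrix of int16 coordinates occupies 72 bytes, already aligned.
  assert(RoundUpToMultipleOf8(72) == 72);
  return 0;
}
```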
 constexpr int64_t kBufferSize = 1 << 20;
-  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &this->mmap_));
 
   std::vector<int64_t> shape = {4, 6};
   std::vector<std::string> dim_names = {"foo", "bar", "baz"};
@@ -1149,11 +1251,23 @@ TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
   auto data = Buffer::Wrap(values);
   NumericTensor<Int64Type> t(data, shape, {}, dim_names);
-  SparseTensorImpl<SparseCSRIndex> st(t);
+  SparseTensorImpl<SparseCSRIndex> st(t, TypeTraits<IndexValueType>::type_singleton());
 
-  CheckSparseTensorRoundTrip(st);
+  this->CheckSparseTensorRoundTrip(st);
 }
 
+REGISTER_TYPED_TEST_CASE_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor,
+                           WithSparseCOOIndexColumnMajor, WithSparseCSRIndex);
+
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseTensorRoundTrip, Int8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseTensorRoundTrip, UInt8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseTensorRoundTrip, Int16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseTensorRoundTrip, UInt16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseTensorRoundTrip, Int32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseTensorRoundTrip, UInt32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseTensorRoundTrip, Int64Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseTensorRoundTrip, UInt64Type);
+
 TEST(TestRecordBatchStreamReader, MalformedInput) {
   const std::string empty_str = "";
   const std::string garbage_str = "12345678";
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 6fcbd92573468..be0bedf592526 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -860,27 +860,39 @@ Status ReadTensor(const Message& message, std::shared_ptr<Tensor>* out) {
 
 namespace {
 
-Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim,
-                          int64_t non_zero_length, io::RandomAccessFile* file,
-                          std::shared_ptr<SparseIndex>* out) {
+Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor,
+                          const std::vector<int64_t>& shape, int64_t non_zero_length,
+                          io::RandomAccessFile* file, std::shared_ptr<SparseIndex>* out) {
   auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO();
+
+  std::shared_ptr<DataType> indices_type;
+  RETURN_NOT_OK(internal::GetSparseCOOIndexMetadata(sparse_index, &indices_type));
+
   auto* indices_buffer = sparse_index->indicesBuffer();
   std::shared_ptr<Buffer> indices_data;
   RETURN_NOT_OK(
       file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data));
-  std::vector<int64_t> shape({non_zero_length, ndim});
-  const int64_t elsize = sizeof(int64_t);
-  std::vector<int64_t> strides({elsize, elsize * non_zero_length});
+  std::vector<int64_t> indices_shape(
+      {non_zero_length, static_cast<int64_t>(shape.size())});
+  auto* indices_strides = sparse_index->indicesStrides();
+  std::vector<int64_t> strides;
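The strides recovered here are the ones the writer captured from `indices()->strides()`, so both row-major and column-major coordinate matrices survive the round trip. For an N x M index matrix with `elsize`-byte elements, the two layouts exercised by the tests work out as in this sketch (not Arrow API; `ReadSparseCOOIndex` resumes below):

```cpp
#include <array>
#include <cstdint>

// Byte strides of an N x M coordinates matrix with elsize-byte elements.
std::array<int64_t, 2> RowMajorStrides(int64_t n_cols, int64_t elsize) {
  return {n_cols * elsize, elsize};  // advancing dimension 0 skips a whole row
}

std::array<int64_t, 2> ColumnMajorStrides(int64_t n_rows, int64_t elsize) {
  return {elsize, n_rows * elsize};  // advancing dimension 1 skips a whole column
}

// For the 12x3 int16 index in the tests: row-major {6, 2}, column-major {2, 24}.
```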
+  // Assume indices_strides has exactly two entries (one per dimension of the
+  // indices matrix).
+  strides.push_back(indices_strides->Get(0));
+  strides.push_back(indices_strides->Get(1));
   *out = std::make_shared<SparseCOOIndex>(
-      std::make_shared<SparseCOOIndex::CoordsTensor>(indices_data, shape, strides));
+      std::make_shared<Tensor>(indices_type, indices_data, indices_shape, strides));
   return Status::OK();
 }
 
-Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim,
-                          int64_t non_zero_length, io::RandomAccessFile* file,
-                          std::shared_ptr<SparseIndex>* out) {
+Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor,
+                          const std::vector<int64_t>& shape, int64_t non_zero_length,
+                          io::RandomAccessFile* file, std::shared_ptr<SparseIndex>* out) {
   auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR();
 
+  std::shared_ptr<DataType> indptr_type, indices_type;
+  RETURN_NOT_OK(
+      internal::GetSparseCSRIndexMetadata(sparse_index, &indptr_type, &indices_type));
+
   auto* indptr_buffer = sparse_index->indptrBuffer();
   std::shared_ptr<Buffer> indptr_data;
   RETURN_NOT_OK(
       file->ReadAt(indptr_buffer->offset(), indptr_buffer->length(), &indptr_data));
@@ -891,11 +903,12 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t nd
   RETURN_NOT_OK(
       file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data));
 
-  std::vector<int64_t> indptr_shape({ndim + 1});
+  std::vector<int64_t> indptr_shape({shape[0] + 1});
   std::vector<int64_t> indices_shape({non_zero_length});
+
   *out = std::make_shared<SparseCSRIndex>(
-      std::make_shared<SparseCSRIndex::IndexTensor>(indptr_data, indptr_shape),
-      std::make_shared<SparseCSRIndex::IndexTensor>(indices_data, indices_shape));
+      std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+      std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
   return Status::OK();
 }
 
@@ -952,15 +965,15 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file,
   std::shared_ptr<SparseIndex> sparse_index;
   switch (sparse_tensor_format_id) {
     case SparseTensorFormat::COO:
-      RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file,
-                                       &sparse_index));
+      RETURN_NOT_OK(
+          ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index));
       return MakeSparseTensorWithSparseCOOIndex(
           type, shape, dim_names, checked_pointer_cast<SparseCOOIndex>(sparse_index),
           non_zero_length, data, out);
     case SparseTensorFormat::CSR:
-      RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file,
-                                       &sparse_index));
+      RETURN_NOT_OK(
+          ReadSparseCSRIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index));
       return MakeSparseTensorWithSparseCSRIndex(
           type, shape, dim_names, checked_pointer_cast<SparseCSRIndex>(sparse_index),
          non_zero_length, data, out);
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index d90392df139fd..b6fe2f3a1e54f 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -18,11 +18,13 @@
 #include "arrow/sparse_tensor.h"
 
 #include <functional>
+#include <limits>
 #include <memory>
 #include <numeric>
 
 #include "arrow/compare.h"
 #include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
@@ -34,7 +36,8 @@ namespace {
 template <typename TYPE, typename SparseIndexType>
 class SparseTensorConverter {
  public:
-  explicit SparseTensorConverter(const NumericTensor<TYPE>&) {}
+  explicit SparseTensorConverter(const NumericTensor<TYPE>&,
+                                 const std::shared_ptr<DataType>&) {}
 
   Status Convert() { return Status::Invalid("Unsupported sparse index"); }
 };
 
@@ -47,9 +50,12 @@ struct SparseTensorConverterBase {
   using NumericTensorType = NumericTensor<TYPE>;
   using value_type = typename NumericTensorType::value_type;
 
-  explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {}
+  explicit SparseTensorConverterBase(const NumericTensorType& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type)
+      : tensor_(tensor), index_value_type_(index_value_type) {}
 
   const NumericTensorType& tensor_;
+  const std::shared_ptr<DataType>& index_value_type_;
 };
 
 template <typename TYPE>
@@ -60,17 +66,23 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
   using typename BaseClass::NumericTensorType;
   using typename BaseClass::value_type;
 
-  explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
+  explicit SparseTensorConverter(const NumericTensorType& tensor,
+                                 const std::shared_ptr<DataType>& index_value_type)
+      : BaseClass(tensor, index_value_type) {}
 
+  template <typename IndexValueType>
   Status Convert() {
+    using c_index_value_type = typename IndexValueType::c_type;
+    const int64_t indices_elsize = sizeof(c_index_value_type);
+
     const int64_t ndim = tensor_.ndim();
     int64_t nonzero_count = -1;
     RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
 
     std::shared_ptr<Buffer> indices_buffer;
-    RETURN_NOT_OK(
-        AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer));
-    int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
+    RETURN_NOT_OK(AllocateBuffer(indices_elsize * ndim * nonzero_count, &indices_buffer));
+    c_index_value_type* indices =
+        reinterpret_cast<c_index_value_type*>(indices_buffer->mutable_data());
 
     std::shared_ptr<Buffer> values_buffer;
     RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer));
@@ -81,7 +93,7 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
       const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
       for (int64_t i = 0; i < count; ++i, ++data) {
         if (*data != 0) {
-          *indices++ = i;
+          *indices++ = static_cast<c_index_value_type>(i);
           *values++ = *data;
         }
       }
@@ -94,9 +106,9 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
         if (tensor_.Value(coord) != 0) {
           *values++ = x;
 
-          int64_t* indp = indices;
+          c_index_value_type* indp = indices;
           for (int64_t i = 0; i < ndim; ++i) {
-            *indp = coord[i];
+            *indp = static_cast<c_index_value_type>(coord[i]);
             indp += nonzero_count;
           }
           indices++;
@@ -117,21 +129,36 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
 
     // make results
     const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
-    const int64_t indices_elsize = sizeof(int64_t);
     const std::vector<int64_t> indices_strides = {indices_elsize,
                                                   indices_elsize * nonzero_count};
-    sparse_index =
-        std::make_shared<SparseCOOIndex>(std::make_shared<SparseCOOIndex::CoordsTensor>(
-            indices_buffer, indices_shape, indices_strides));
+    sparse_index = std::make_shared<SparseCOOIndex>(std::make_shared<Tensor>(
+        index_value_type_, indices_buffer, indices_shape, indices_strides));
     data = values_buffer;
 
     return Status::OK();
   }
 
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following Status::Invalid causes a program failure.
+      default:
+        return Status::Invalid("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
   std::shared_ptr<SparseCOOIndex> sparse_index;
   std::shared_ptr<Buffer> data;
 
  private:
+  using BaseClass::index_value_type_;
   using BaseClass::tensor_;
 };
 
@@ -146,12 +173,21 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
   using NumericTensorType = typename BaseClass::NumericTensorType;
   using value_type = typename BaseClass::value_type;
 
-  explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
+  explicit SparseTensorConverter(const NumericTensorType& tensor,
+                                 const std::shared_ptr<DataType>& index_value_type)
+      : BaseClass(tensor, index_value_type) {}
 
+  template <typename IndexValueType>
   Status Convert() {
+    using c_index_value_type = typename IndexValueType::c_type;
+    RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits<c_index_value_type>::max()));
+    const int64_t indices_elsize = sizeof(c_index_value_type);
+
     const int64_t ndim = tensor_.ndim();
     if (ndim > 2) {
+      // LCOV_EXCL_START: The following Status::Invalid causes a program failure.
       return Status::Invalid("Invalid tensor dimension");
+      // LCOV_EXCL_STOP
     }
 
     const int64_t nr = tensor_.shape()[0];
@@ -169,20 +205,21 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     if (ndim <= 1) {
       return Status::NotImplemented("TODO for ndim <= 1");
     } else {
-      RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer));
-      int64_t* indptr = reinterpret_cast<int64_t*>(indptr_buffer->mutable_data());
+      RETURN_NOT_OK(AllocateBuffer(indices_elsize * (nr + 1), &indptr_buffer));
+      auto* indptr = reinterpret_cast<c_index_value_type*>(indptr_buffer->mutable_data());
 
-      RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer));
-      int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
+      RETURN_NOT_OK(AllocateBuffer(indices_elsize * nonzero_count, &indices_buffer));
+      auto* indices =
+          reinterpret_cast<c_index_value_type*>(indices_buffer->mutable_data());
 
-      int64_t k = 0;
+      c_index_value_type k = 0;
       *indptr++ = 0;
       for (int64_t i = 0; i < nr; ++i) {
         for (int64_t j = 0; j < nc; ++j) {
           const value_type x = tensor_.Value({i, j});
           if (x != 0) {
             *values++ = x;
-            *indices++ = j;
+            *indices++ = static_cast<c_index_value_type>(j);
             k++;
           }
         }
@@ -191,12 +228,12 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     }
 
     std::vector<int64_t> indptr_shape({nr + 1});
-    std::shared_ptr<SparseCSRIndex::IndexTensor> indptr_tensor =
-        std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape);
+    std::shared_ptr<Tensor> indptr_tensor =
+        std::make_shared<Tensor>(index_value_type_, indptr_buffer, indptr_shape);
 
     std::vector<int64_t> indices_shape({nonzero_count});
-    std::shared_ptr<SparseCSRIndex::IndexTensor> indices_tensor =
-        std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape);
+    std::shared_ptr<Tensor> indices_tensor =
+        std::make_shared<Tensor>(index_value_type_, indices_buffer, indices_shape);
 
     sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
     data = values_buffer;
@@ -204,11 +241,42 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
     return Status::OK();
   }
 
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following Status::Invalid causes a program failure.
+      default:
+        return Status::Invalid("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
   std::shared_ptr<SparseCSRIndex> sparse_index;
   std::shared_ptr<Buffer> data;
 
  private:
+  using BaseClass::index_value_type_;
   using BaseClass::tensor_;
+
+  template <typename c_value_type>
+  inline Status CheckMaximumValue(const c_value_type type_max) const {
+    if (static_cast<int64_t>(type_max) < tensor_.shape()[1]) {
+      // LCOV_EXCL_START: The following Status::Invalid causes a program failure.
+      return Status::Invalid("The bit width of the index value type is too small");
+      // LCOV_EXCL_STOP
+    }
+    return Status::OK();
+  }
+
+  inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); }
+
+  inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); }
 };
 
 // ----------------------------------------------------------------------
@@ -238,76 +306,61 @@ namespace {
 
 template <typename TYPE, typename SparseIndexType>
 void MakeSparseTensorFromTensor(const Tensor& tensor,
+                                const std::shared_ptr<DataType>& index_value_type,
                                 std::shared_ptr<SparseIndex>* sparse_index,
                                 std::shared_ptr<Buffer>* data) {
   NumericTensor<TYPE> numeric_tensor(tensor.data(), tensor.shape(), tensor.strides());
-  SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor);
+  SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor,
+                                                         index_value_type);
   ARROW_CHECK_OK(converter.Convert());
   *sparse_index = converter.sparse_index;
   *data = converter.data;
 }
 
+#define MAKE_SPARSE_TENSOR_FROM_TENSOR(TYPE_CLASS)                 \
+  case TYPE_CLASS##Type::type_id:                                  \
+    MakeSparseTensorFromTensor<TYPE_CLASS##Type, SparseIndexType>( \
+        tensor, index_value_type, sparse_index, data);             \
+    break;
+
 template <typename SparseIndexType>
 inline void MakeSparseTensorFromTensor(const Tensor& tensor,
+                                       const std::shared_ptr<DataType>& index_value_type,
                                        std::shared_ptr<SparseIndex>* sparse_index,
                                        std::shared_ptr<Buffer>* data) {
   switch (tensor.type()->id()) {
-    case Type::UINT8:
-      MakeSparseTensorFromTensor<UInt8Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT8:
-      MakeSparseTensorFromTensor<Int8Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT16:
-      MakeSparseTensorFromTensor<UInt16Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT16:
-      MakeSparseTensorFromTensor<Int16Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT32:
-      MakeSparseTensorFromTensor<UInt32Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT32:
-      MakeSparseTensorFromTensor<Int32Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::UINT64:
-      MakeSparseTensorFromTensor<UInt64Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::INT64:
-      MakeSparseTensorFromTensor<Int64Type, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::HALF_FLOAT:
-      MakeSparseTensorFromTensor<HalfFloatType, SparseIndexType>(tensor, sparse_index,
-                                                                 data);
-      break;
-    case Type::FLOAT:
-      MakeSparseTensorFromTensor<FloatType, SparseIndexType>(tensor, sparse_index, data);
-      break;
-    case Type::DOUBLE:
-      MakeSparseTensorFromTensor<DoubleType, SparseIndexType>(tensor, sparse_index, data);
-      break;
+    ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_SPARSE_TENSOR_FROM_TENSOR);
+    // LCOV_EXCL_START: ignore program failure
     default:
       ARROW_LOG(FATAL) << "Unsupported Tensor value type";
       break;
+      // LCOV_EXCL_STOP
   }
 }
 
+#undef MAKE_SPARSE_TENSOR_FROM_TENSOR
+
 }  // namespace
 
 void MakeSparseTensorFromTensor(const Tensor& tensor,
                                 SparseTensorFormat::type sparse_format_id,
+                                const std::shared_ptr<DataType>& index_value_type,
                                 std::shared_ptr<SparseIndex>* sparse_index,
                                 std::shared_ptr<Buffer>* data) {
   switch (sparse_format_id) {
    case SparseTensorFormat::COO:
-      MakeSparseTensorFromTensor<SparseCOOIndex>(tensor, sparse_index, data);
+      MakeSparseTensorFromTensor<SparseCOOIndex>(tensor, index_value_type, sparse_index,
+                                                 data);
      break;
    case SparseTensorFormat::CSR:
-      MakeSparseTensorFromTensor<SparseCSRIndex>(tensor, sparse_index, data);
+      MakeSparseTensorFromTensor<SparseCSRIndex>(tensor, index_value_type, sparse_index,
+                                                 data);
      break;
+    //
LCOV_EXCL_START: ignore program failure default: ARROW_LOG(FATAL) << "Invalid sparse tensor format ID"; break; + // LCOV_EXCL_STOP } } @@ -316,10 +369,12 @@ void MakeSparseTensorFromTensor(const Tensor& tensor, // ---------------------------------------------------------------------- // SparseCOOIndex -// Constructor with a column-major NumericTensor -SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) +// Constructor with a contiguous NumericTensor +SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) : SparseIndexBase(coords->shape()[0]), coords_(coords) { - ARROW_CHECK(coords_->is_column_major()); + ARROW_CHECK(is_integer(coords_->type_id())); + ARROW_CHECK(coords_->is_contiguous()); + ARROW_CHECK_EQ(2, coords_->ndim()); } std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } @@ -328,10 +383,12 @@ std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOInde // SparseCSRIndex // Constructor with two index vectors -SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices) +SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices) : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) { + ARROW_CHECK(is_integer(indptr_->type_id())); ARROW_CHECK_EQ(1, indptr_->ndim()); + ARROW_CHECK(is_integer(indices_->type_id())); ARROW_CHECK_EQ(1, indices_->ndim()); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index b6fe4b205978e..2b31b4763346a 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -81,15 +81,13 @@ class SparseIndexBase : public SparseIndex { /// coordinates. class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase { public: - using CoordsTensor = NumericTensor; - static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; // Constructor with a column-major NumericTensor - explicit SparseCOOIndex(const std::shared_ptr& coords); + explicit SparseCOOIndex(const std::shared_ptr& coords); /// \brief Return a tensor that has the coordinates of the non-zero values - const std::shared_ptr& indices() const { return coords_; } + const std::shared_ptr& indices() const { return coords_; } /// \brief Return a string representation of the sparse index std::string ToString() const override; @@ -100,7 +98,7 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase coords_; + std::shared_ptr coords_; }; // ---------------------------------------------------------------------- @@ -120,19 +118,17 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase { public: - using IndexTensor = NumericTensor; - static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; // Constructor with two index vectors - explicit SparseCSRIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices); + explicit SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices); /// \brief Return a 1D tensor of indptr vector - const std::shared_ptr& indptr() const { return indptr_; } + const std::shared_ptr& indptr() const { return indptr_; } /// \brief Return a 1D tensor of indices vector - const std::shared_ptr& indices() const { return indices_; } + const std::shared_ptr& indices() const { return indices_; } /// \brief Return a string representation of the sparse index std::string ToString() const override; @@ -143,8 +139,8 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase indptr_; 
- std::shared_ptr indices_; + std::shared_ptr indptr_; + std::shared_ptr indices_; }; // ---------------------------------------------------------------------- @@ -222,6 +218,7 @@ namespace internal { ARROW_EXPORT void MakeSparseTensorFromTensor(const Tensor& tensor, SparseTensorFormat::type sparse_format_id, + const std::shared_ptr& index_value_type, std::shared_ptr* sparse_index, std::shared_ptr* data); @@ -248,13 +245,16 @@ class SparseTensorImpl : public SparseTensor { : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {} // Constructor with a dense tensor - explicit SparseTensorImpl(const Tensor& tensor) + SparseTensorImpl(const Tensor& tensor, + const std::shared_ptr& index_value_type) : SparseTensorImpl(NULLPTR, tensor.type(), NULLPTR, tensor.shape(), tensor.dim_names_) { internal::MakeSparseTensorFromTensor(tensor, SparseIndexType::format_id, - &sparse_index_, &data_); + index_value_type, &sparse_index_, &data_); } + explicit SparseTensorImpl(const Tensor& tensor) : SparseTensorImpl(tensor, int64()) {} + private: ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); }; diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 69ec4ca5c6052..a37f59c321170 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -39,21 +39,53 @@ static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); } -static inline void AssertCOOIndex( - const std::shared_ptr& sidx, const int64_t nth, - const std::vector& expected_values) { +static inline void AssertCOOIndex(const std::shared_ptr& sidx, const int64_t nth, + const std::vector& expected_values) { int64_t n = static_cast(expected_values.size()); for (int64_t i = 0; i < n; ++i) { - ASSERT_EQ(expected_values[i], sidx->Value({nth, i})); + ASSERT_EQ(expected_values[i], sidx->Value({nth, i})); } } -TEST(TestSparseCOOTensor, CreationEmptyTensor) { - std::vector shape = {2, 3, 4}; - SparseTensorImpl st1(int64(), shape); +template +class TestSparseCOOTensorBase : public ::testing::Test { + public: + void SetUp() { + shape_ = {2, 3, 4}; + dim_names_ = {"foo", "bar", "baz"}; + + // Dense representation: + // [ + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // ], + // [ + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + // ] + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); + sparse_tensor_from_dense_ = std::make_shared( + dense_tensor, TypeTraits::type_singleton()); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCOOTensor : public TestSparseCOOTensorBase {}; - std::vector dim_names = {"foo", "bar", "baz"}; - SparseTensorImpl st2(int64(), shape, dim_names); +TEST_F(TestSparseCOOTensor, CreationEmptyTensor) { + SparseTensorImpl st1(int64(), this->shape_); + SparseTensorImpl st2(int64(), this->shape_, this->dim_names_); ASSERT_EQ(0, st1.non_zero_length()); ASSERT_EQ(0, st2.non_zero_length()); @@ -72,39 +104,20 @@ TEST(TestSparseCOOTensor, CreationEmptyTensor) { ASSERT_EQ("", st1.dim_name(2)); } -TEST(TestSparseCOOTensor, CreationFromNumericTensor) { - std::vector shape = {2, 3, 4}; - std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, - 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer = Buffer::Wrap(values); - 
std::vector dim_names = {"foo", "bar", "baz"}; - NumericTensor tensor1(buffer, shape); - NumericTensor tensor2(buffer, shape, {}, dim_names); - SparseTensorImpl st1(tensor1); - SparseTensorImpl st2(tensor2); - - CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); - - ASSERT_EQ(12, st1.non_zero_length()); - ASSERT_TRUE(st1.is_mutable()); +TEST_F(TestSparseCOOTensor, CreationFromNumericTensor) { + auto& st = *this->sparse_tensor_from_dense_; + CheckSparseIndexFormatType(SparseTensorFormat::COO, st); - ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st2.dim_names()); - ASSERT_EQ("foo", st2.dim_name(0)); - ASSERT_EQ("bar", st2.dim_name(1)); - ASSERT_EQ("baz", st2.dim_name(2)); - - ASSERT_EQ(std::vector({}), st1.dim_names()); - ASSERT_EQ("", st1.dim_name(0)); - ASSERT_EQ("", st1.dim_name(1)); - ASSERT_EQ("", st1.dim_name(2)); + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); - const int64_t* raw_data = reinterpret_cast(st1.raw_data()); + auto* raw_data = reinterpret_cast(st.raw_data()); AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); - const auto& si = internal::checked_cast(*st1.sparse_index()); + const auto& si = internal::checked_cast(*st.sparse_index()); ASSERT_EQ(std::string("SparseCOOIndex"), si.ToString()); - std::shared_ptr sidx = si.indices(); + std::shared_ptr sidx = si.indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -115,113 +128,264 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { AssertCOOIndex(sidx, 11, {1, 2, 3}); } -TEST(TestSparseCOOTensor, CreationFromTensor) { - std::vector shape = {2, 3, 4}; - std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, - 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer = Buffer::Wrap(values); - std::vector dim_names = {"foo", "bar", "baz"}; - Tensor tensor1(int64(), buffer, shape); - Tensor tensor2(int64(), buffer, shape, {}, dim_names); - SparseTensorImpl st1(tensor1); - SparseTensorImpl st2(tensor2); +TEST_F(TestSparseCOOTensor, CreationFromNumericTensor1D) { + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + std::vector dense_shape({static_cast(dense_values.size())}); + NumericTensor dense_vector(dense_data, dense_shape); + SparseTensorImpl st(dense_vector); - ASSERT_EQ(12, st1.non_zero_length()); - ASSERT_TRUE(st1.is_mutable()); + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); - ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st2.dim_names()); - ASSERT_EQ("foo", st2.dim_name(0)); - ASSERT_EQ("bar", st2.dim_name(1)); - ASSERT_EQ("baz", st2.dim_name(2)); + auto* raw_data = reinterpret_cast(st.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); - ASSERT_EQ(std::vector({}), st1.dim_names()); - ASSERT_EQ("", st1.dim_name(0)); - ASSERT_EQ("", st1.dim_name(1)); - ASSERT_EQ("", st1.dim_name(2)); + const auto& si = internal::checked_cast(*st.sparse_index()); + auto sidx = si.indices(); + ASSERT_EQ(std::vector({12, 1}), sidx->shape()); + + AssertCOOIndex(sidx, 0, {0}); + AssertCOOIndex(sidx, 1, {2}); + AssertCOOIndex(sidx, 2, {5}); + AssertCOOIndex(sidx, 10, {21}); + AssertCOOIndex(sidx, 11, {23}); +} - const int64_t* raw_data = reinterpret_cast(st1.raw_data()); - AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); +TEST_F(TestSparseCOOTensor, CreationFromTensor) { + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, this->shape_, {}, this->dim_names_); + SparseTensorImpl st(tensor); - const auto& si = internal::checked_cast(*st1.sparse_index()); - std::shared_ptr sidx = si.indices(); - ASSERT_EQ(std::vector({12, 3}), sidx->shape()); - ASSERT_TRUE(sidx->is_column_major()); + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); - AssertCOOIndex(sidx, 0, {0, 0, 0}); - AssertCOOIndex(sidx, 1, {0, 0, 2}); - AssertCOOIndex(sidx, 2, {0, 1, 1}); - AssertCOOIndex(sidx, 10, {1, 2, 1}); - AssertCOOIndex(sidx, 11, {1, 2, 3}); + ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st.dim_names()); + ASSERT_EQ("foo", st.dim_name(0)); + ASSERT_EQ("bar", st.dim_name(1)); + ASSERT_EQ("baz", st.dim_name(2)); + + ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_)); } -TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) { - std::vector shape = {2, 3, 4}; +TEST_F(TestSparseCOOTensor, CreationFromNonContiguousTensor) { std::vector values = {1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0, 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0}; std::vector strides = {192, 64, 16}; std::shared_ptr buffer = Buffer::Wrap(values); - Tensor tensor(int64(), buffer, shape, strides); + Tensor tensor(int64(), buffer, this->shape_, strides); SparseTensorImpl st(tensor); ASSERT_EQ(12, st.non_zero_length()); ASSERT_TRUE(st.is_mutable()); - const int64_t* raw_data = reinterpret_cast(st.raw_data()); - AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); - - const auto& si = internal::checked_cast(*st.sparse_index()); - std::shared_ptr sidx = si.indices(); - ASSERT_EQ(std::vector({12, 3}), sidx->shape()); - ASSERT_TRUE(sidx->is_column_major()); - - AssertCOOIndex(sidx, 0, {0, 0, 0}); - AssertCOOIndex(sidx, 1, {0, 0, 2}); - AssertCOOIndex(sidx, 2, {0, 1, 1}); - AssertCOOIndex(sidx, 10, {1, 2, 1}); - AssertCOOIndex(sidx, 11, {1, 2, 3}); + ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_)); } -TEST(TestSparseCOOTensor, TensorEquality) { - std::vector shape = {2, 3, 4}; +TEST_F(TestSparseCOOTensor, TensorEquality) { std::vector values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; std::vector values2 = {0, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; std::shared_ptr buffer1 = Buffer::Wrap(values1); std::shared_ptr buffer2 = Buffer::Wrap(values2); - NumericTensor tensor1(buffer1, shape); - NumericTensor tensor2(buffer1, shape); - NumericTensor tensor3(buffer2, shape); + NumericTensor tensor1(buffer1, this->shape_); + NumericTensor tensor2(buffer2, this->shape_); SparseTensorImpl st1(tensor1); SparseTensorImpl st2(tensor2); - SparseTensorImpl st3(tensor3); - ASSERT_TRUE(st1.Equals(st2)); - ASSERT_TRUE(!st1.Equals(st3)); + ASSERT_TRUE(st1.Equals(*this->sparse_tensor_from_dense_)); + ASSERT_FALSE(st1.Equals(st2)); +} + +template +class TestSparseCOOTensorForIndexValueType + : public TestSparseCOOTensorBase { + protected: + std::shared_ptr MakeSparseCOOIndex( + const std::vector& coords_shape, + const std::vector& coords_strides, + std::vector& coords_values) const { + auto coords_data = Buffer::Wrap(coords_values); + auto coords = std::make_shared>( + coords_data, coords_shape, coords_strides); + return std::make_shared(coords); + } + + template + std::shared_ptr MakeSparseTensor( + const std::shared_ptr& si, + std::vector& sparse_values) const { + auto data = 
Buffer::Wrap(sparse_values); + return std::make_shared(si, + CTypeTraits::type_singleton(), + data, this->shape_, this->dim_names_); + } +}; + +TYPED_TEST_CASE_P(TestSparseCOOTensorForIndexValueType); + +TYPED_TEST_P(TestSparseCOOTensorForIndexValueType, CreationWithRowMajorIndex) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + // Sparse representation: + // idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1] + // idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2] + // idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3] + // data = [1 2 3 4 5 6 11 12 13 14 15 16] + std::vector coords_values = {0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3, + 0, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 3, + 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3}; + const int sizeof_index_value = sizeof(c_index_value_type); + auto si = this->MakeSparseCOOIndex( + {12, 3}, {sizeof_index_value * 3, sizeof_index_value}, coords_values); + + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; + auto st = this->MakeSparseTensor(si, sparse_values); + + ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st->dim_names()); + ASSERT_EQ("foo", st->dim_name(0)); + ASSERT_EQ("bar", st->dim_name(1)); + ASSERT_EQ("baz", st->dim_name(2)); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); } -TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { - std::vector shape = {6, 4}; +TYPED_TEST_P(TestSparseCOOTensorForIndexValueType, CreationWithColumnMajorIndex) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + // Sparse representation: + // idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1] + // idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2] + // idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3] + // data = [1 2 3 4 5 6 11 12 13 14 15 16] + std::vector coords_values = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, + 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}; + const int sizeof_index_value = sizeof(c_index_value_type); + auto si = this->MakeSparseCOOIndex( + {12, 3}, {sizeof_index_value, sizeof_index_value * 12}, coords_values); + + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; + auto st = this->MakeSparseTensor(si, sparse_values); + + ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st->dim_names()); + ASSERT_EQ("foo", st->dim_name(0)); + ASSERT_EQ("bar", st->dim_name(1)); + ASSERT_EQ("baz", st->dim_name(2)); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCOOTensorForIndexValueType, + EqualityBetweenRowAndColumnMajorIndices) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + // Sparse representation: + // idx[0] = [0 0 0 0 0 0 1 1 1 1 1 1] + // idx[1] = [0 0 1 1 2 2 0 0 1 1 2 2] + // idx[2] = [0 2 1 3 0 2 1 3 0 2 1 3] + // data = [1 2 3 4 5 6 11 12 13 14 15 16] + + // Row-major COO index + const std::vector coords_shape = {12, 3}; + const int sizeof_index_value = sizeof(c_index_value_type); + std::vector coords_values_row_major = { + 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 3, 0, 2, 0, 0, 2, 2, + 1, 0, 1, 1, 0, 3, 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 3}; + auto si_row_major = + this->MakeSparseCOOIndex(coords_shape, {sizeof_index_value * 3, sizeof_index_value}, + coords_values_row_major); + + // Column-major COO index + std::vector coords_values_col_major = { + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2, + 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}; + auto si_col_major = this->MakeSparseCOOIndex( + coords_shape, {sizeof_index_value, sizeof_index_value * 12}, + 
coords_values_col_major); + + std::vector sparse_values_1 = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; + auto st1 = this->MakeSparseTensor(si_row_major, sparse_values_1); + + std::vector sparse_values_2 = sparse_values_1; + auto st2 = this->MakeSparseTensor(si_row_major, sparse_values_2); + + ASSERT_TRUE(st2->Equals(*st1)); +} + +REGISTER_TYPED_TEST_CASE_P(TestSparseCOOTensorForIndexValueType, + CreationWithRowMajorIndex, CreationWithColumnMajorIndex, + EqualityBetweenRowAndColumnMajorIndices); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCOOTensorForIndexValueType, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCOOTensorForIndexValueType, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCOOTensorForIndexValueType, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCOOTensorForIndexValueType, + UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCOOTensorForIndexValueType, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCOOTensorForIndexValueType, + UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCOOTensorForIndexValueType, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCOOTensorForIndexValueType, + UInt64Type); + +template +class TestSparseCSRMatrixBase : public ::testing::Test { + public: + void SetUp() { + shape_ = {6, 4}; + dim_names_ = {"foo", "bar"}; + + // Dense representation: + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); + sparse_tensor_from_dense_ = std::make_shared( + dense_tensor, TypeTraits::type_singleton()); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCSRMatrix : public TestSparseCSRMatrixBase {}; + +TEST_F(TestSparseCSRMatrix, CreationFromNumericTensor2D) { std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; std::shared_ptr buffer = Buffer::Wrap(values); - std::vector dim_names = {"foo", "bar", "baz"}; - NumericTensor tensor1(buffer, shape); - NumericTensor tensor2(buffer, shape, {}, dim_names); + NumericTensor tensor(buffer, this->shape_); - SparseTensorImpl st1(tensor1); - SparseTensorImpl st2(tensor2); + SparseTensorImpl st1(tensor); + SparseTensorImpl& st2 = *this->sparse_tensor_from_dense_; CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); ASSERT_EQ(12, st1.non_zero_length()); ASSERT_TRUE(st1.is_mutable()); - ASSERT_EQ(std::vector({"foo", "bar", "baz"}), st2.dim_names()); + ASSERT_EQ(std::vector({"foo", "bar"}), st2.dim_names()); ASSERT_EQ("foo", st2.dim_name(0)); ASSERT_EQ("bar", st2.dim_name(1)); - ASSERT_EQ("baz", st2.dim_name(2)); ASSERT_EQ(std::vector({}), st1.dim_names()); ASSERT_EQ("", st1.dim_name(0)); @@ -252,14 +416,13 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); } -TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) { - std::vector shape = {6, 4}; +TEST_F(TestSparseCSRMatrix, CreationFromNonContiguousTensor) { std::vector values = {1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0, 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0}; std::vector strides = {64, 16}; std::shared_ptr 
buffer = Buffer::Wrap(values); - Tensor tensor(int64(), buffer, shape, strides); + Tensor tensor(int64(), buffer, this->shape_, strides); SparseTensorImpl st(tensor); ASSERT_EQ(12, st.non_zero_length()); @@ -286,26 +449,24 @@ TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) { ASSERT_EQ(12, indices_values.size()); ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); + + ASSERT_TRUE(st.Equals(*this->sparse_tensor_from_dense_)); } -TEST(TestSparseCSRMatrix, TensorEquality) { - std::vector shape = {6, 4}; +TEST_F(TestSparseCSRMatrix, TensorEquality) { std::vector values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::vector values2 = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + std::vector values2 = {9, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; std::shared_ptr buffer1 = Buffer::Wrap(values1); std::shared_ptr buffer2 = Buffer::Wrap(values2); - NumericTensor tensor1(buffer1, shape); - NumericTensor tensor2(buffer1, shape); - NumericTensor tensor3(buffer2, shape); + NumericTensor tensor1(buffer1, this->shape_); + NumericTensor tensor2(buffer2, this->shape_); SparseTensorImpl st1(tensor1); SparseTensorImpl st2(tensor2); - SparseTensorImpl st3(tensor3); - ASSERT_TRUE(st1.Equals(st2)); - ASSERT_TRUE(!st1.Equals(st3)); + ASSERT_TRUE(st1.Equals(*this->sparse_tensor_from_dense_)); + ASSERT_FALSE(st1.Equals(st2)); } } // namespace arrow diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 58ef6f7a50179..95a5c6ec6e120 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -31,39 +31,45 @@ namespace arrow { -#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ - ACTION(Null); \ - ACTION(Boolean); \ - ACTION(Int8); \ - ACTION(UInt8); \ - ACTION(Int16); \ - ACTION(UInt16); \ - ACTION(Int32); \ - ACTION(UInt32); \ - ACTION(Int64); \ - ACTION(UInt64); \ - ACTION(HalfFloat); \ - ACTION(Float); \ - ACTION(Double); \ - ACTION(String); \ - ACTION(Binary); \ - ACTION(LargeString); \ - ACTION(LargeBinary); \ - ACTION(FixedSizeBinary); \ - ACTION(Duration); \ - ACTION(Date32); \ - ACTION(Date64); \ - ACTION(Timestamp); \ - ACTION(Time32); \ - ACTION(Time64); \ - ACTION(Decimal128); \ - ACTION(List); \ - ACTION(LargeList); \ - ACTION(Map); \ - ACTION(FixedSizeList); \ - ACTION(Struct); \ - ACTION(Union); \ - ACTION(Dictionary); \ +#define ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION) \ + ACTION(Int8); \ + ACTION(UInt8); \ + ACTION(Int16); \ + ACTION(UInt16); \ + ACTION(Int32); \ + ACTION(UInt32); \ + ACTION(Int64); \ + ACTION(UInt64) + +#define ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION) \ + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION); \ + ACTION(HalfFloat); \ + ACTION(Float); \ + ACTION(Double) + +#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ + ACTION(Null); \ + ACTION(Boolean); \ + ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ + ACTION(String); \ + ACTION(Binary); \ + ACTION(LargeString); \ + ACTION(LargeBinary); \ + ACTION(FixedSizeBinary); \ + ACTION(Duration); \ + ACTION(Date32); \ + ACTION(Date64); \ + ACTION(Timestamp); \ + ACTION(Time32); \ + ACTION(Time64); \ + ACTION(Decimal128); \ + ACTION(List); \ + ACTION(LargeList); \ + ACTION(Map); \ + ACTION(FixedSizeList); \ + ACTION(Struct); \ + ACTION(Union); \ + ACTION(Dictionary); \ ACTION(Extension) #define TYPE_VISIT_INLINE(TYPE_CLASS) \ diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 853dd1985f930..96d954d1edf70 100644 --- 
a/format/SparseTensor.fbs
+++ b/format/SparseTensor.fbs
@@ -25,36 +25,50 @@ namespace org.apache.arrow.flatbuf;
 /// ----------------------------------------------------------------------
 /// EXPERIMENTAL: Data structures for sparse tensors
 
-/// Coodinate format of sparse tensor index.
+/// Coordinate (COO) format of sparse tensor index.
+///
+/// The COO index list is represented as an NxM matrix,
+/// where N is the number of non-zero values,
+/// and M is the number of dimensions of a sparse tensor.
+///
+/// indicesBuffer stores the location and size of the data of this indices
+/// matrix. The value type and the strides of the indices matrix are
+/// specified in the indicesType and indicesStrides fields.
+///
+/// For example, let X be a 2x3x4x5 tensor, and it has the following
+/// 6 non-zero values:
+///
+/// X[0, 1, 2, 0] := 1
+/// X[1, 1, 2, 3] := 2
+/// X[0, 2, 1, 0] := 3
+/// X[0, 1, 3, 0] := 4
+/// X[0, 1, 2, 1] := 5
+/// X[1, 2, 0, 4] := 6
+///
+/// In COO format, the index matrix of X is the following 4x6 matrix:
+///
+/// [[0, 0, 0, 0, 1, 1],
+///  [1, 1, 1, 2, 1, 2],
+///  [2, 2, 3, 1, 2, 0],
+///  [0, 1, 0, 0, 3, 4]]
+///
+/// Note that the indices are sorted in lexicographical order.
 table SparseTensorIndexCOO {
-  /// COO's index list are represented as a NxM matrix,
-  /// where N is the number of non-zero values,
-  /// and M is the number of dimensions of a sparse tensor.
-  /// indicesBuffer stores the location and size of this index matrix.
-  /// The type of index value is long, so the stride for the index matrix is unnecessary.
-  ///
-  /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values:
-  ///
-  /// X[0, 1, 2, 0] := 1
-  /// X[1, 1, 2, 3] := 2
-  /// X[0, 2, 1, 0] := 3
-  /// X[0, 1, 3, 0] := 4
-  /// X[0, 1, 2, 1] := 5
-  /// X[1, 2, 0, 4] := 6
-  ///
-  /// In COO format, the index matrix of X is the following 4x6 matrix:
-  ///
-  /// [[0, 0, 0, 0, 1, 1],
-  ///  [1, 1, 1, 2, 1, 2],
-  ///  [2, 2, 3, 1, 2, 0],
-  ///  [0, 1, 0, 0, 3, 4]]
-  ///
-  /// Note that the indices are sorted in lexicographical order.
+  /// The type of values in indicesBuffer
+  indicesType: Int;
+
+  /// Non-negative byte offsets to advance one value cell along each dimension
+  indicesStrides: [long];
+
+  /// The location and size of the indices matrix's data
   indicesBuffer: Buffer;
 }
 
 /// Compressed Sparse Row format, that is matrix-specific.
 table SparseMatrixIndexCSR {
+  /// The type of values in indptrBuffer
+  indptrType: Int;
+
   /// indptrBuffer stores the location and size of indptr array that
   /// represents the range of the rows.
   /// The i-th row spans from indptr[i] to indptr[i+1] in the data.
@@ -79,6 +93,9 @@ table SparseMatrixIndexCSR {
   /// indptr(X) = [0, 2, 3, 5, 5, 8, 10].
   indptrBuffer: Buffer;
 
+  /// The type of values in indicesBuffer
+  indicesType: Int;
+
   /// indicesBuffer stores the location and size of the array that
   /// contains the column indices of the corresponding non-zero values.
   /// The type of index value is long.
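Taken together, these changes let callers choose the index width end to end instead of the previously hard-coded 64-bit indices. A usage sketch against the C++ API introduced by this patch (names as in the diff; error handling and ownership details elided):

```cpp
#include <memory>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/sparse_tensor.h"
#include "arrow/tensor.h"
#include "arrow/type.h"

// Build a SparseCOOIndex whose coordinates are int16_t. The static storage
// keeps the data alive for the non-owning view created by Buffer::Wrap.
std::shared_ptr<arrow::SparseCOOIndex> MakeInt16COOIndex() {
  static std::vector<int16_t> coords = {0, 0, 0, 2, 1, 1};  // three non-zeros, 2-D
  auto data = arrow::Buffer::Wrap(coords);
  const int64_t elsize = sizeof(int16_t);
  // A 3x2 row-major index matrix: strides {2 * elsize, elsize}.
  auto coords_tensor = std::make_shared<arrow::Tensor>(
      arrow::int16(), data, std::vector<int64_t>{3, 2},
      std::vector<int64_t>{2 * elsize, elsize});
  // The widened constructor accepts any contiguous 2-D integer tensor.
  return std::make_shared<arrow::SparseCOOIndex>(coords_tensor);
}
```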