diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index fccd53c5bc0b4..352b2de3feac9 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -58,6 +58,7 @@ set(FBS_SRC ${ARROW_SOURCE_DIR}/../format/File.fbs ${ARROW_SOURCE_DIR}/../format/Schema.fbs ${ARROW_SOURCE_DIR}/../format/Tensor.fbs + ${ARROW_SOURCE_DIR}/../format/SparseTensor.fbs ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) foreach(FIL ${FBS_SRC}) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index da6711395f8ea..38d8eaa85943f 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -28,7 +28,8 @@ #include "arrow/io/interfaces.h" #include "arrow/ipc/File_generated.h" // IWYU pragma: keep #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep +#include "arrow/ipc/SparseTensor_generated.h" // IWYU pragma: keep +#include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" #include "arrow/sparse_tensor.h" diff --git a/docs/source/format/README.rst b/docs/source/format/README.rst index f2f770bdc95c1..4044026a9460b 100644 --- a/docs/source/format/README.rst +++ b/docs/source/format/README.rst @@ -25,7 +25,7 @@ Currently, the Arrow specification consists of these pieces: - Logical Types, Schemas, and Record Batch Metadata (see Schema.fbs) - Encapsulated Messages (see Message.fbs) - Mechanics of messaging between Arrow systems (IPC, RPC, etc.) 
(see :doc:`IPC`) -- Tensor (Multi-dimensional array) Metadata (see Tensor.fbs) +- Tensor (Multi-dimensional array) Metadata (see Tensor.fbs and SparseTensor.fbs) The metadata currently uses Google's `flatbuffers library`_ for serializing a couple related pieces of information: diff --git a/format/Message.fbs b/format/Message.fbs index e14fdca8f155c..10adaaa5edae8 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -16,6 +16,7 @@ // under the License. include "Schema.fbs"; +include "SparseTensor.fbs"; include "Tensor.fbs"; namespace org.apache.arrow.flatbuf; diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs new file mode 100644 index 0000000000000..0a0c6c25e41e6 --- /dev/null +++ b/format/SparseTensor.fbs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors". +/// Arrow implementations in general are not required to implement this type + +include "Tensor.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// EXPERIMENTAL: Data structures for sparse tensors + +/// Coordinate format of sparse tensor index. 
+table SparseTensorIndexCOO { + /// COO's index list is represented as an NxM matrix, + /// where N is the number of non-zero values, + /// and M is the number of dimensions of a sparse tensor. + /// indicesBuffer stores the location and size of this index matrix. + /// The type of index value is long, so the stride for the index matrix is unnecessary. + /// + /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: + /// + /// X[0, 1, 2, 0] := 1 + /// X[1, 1, 2, 3] := 2 + /// X[0, 2, 1, 0] := 3 + /// X[0, 1, 3, 0] := 4 + /// X[0, 1, 2, 1] := 5 + /// X[1, 2, 0, 4] := 6 + /// + /// In COO format, the index matrix of X is the following 4x6 matrix: + /// + /// [[0, 0, 0, 0, 1, 1], + /// [1, 1, 1, 2, 1, 2], + /// [2, 2, 3, 1, 2, 0], + /// [0, 1, 0, 0, 3, 4]] + /// + /// Note that the indices are sorted in lexicographical order. + indicesBuffer: Buffer; +} + +/// Compressed Sparse Row format, that is matrix-specific. +table SparseMatrixIndexCSR { + /// indptrBuffer stores the location and size of indptr array that + /// represents the range of the rows. + /// The i-th row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// + /// The array of non-zero values in X is: + /// + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// + /// And the indptr of X is: + /// + /// indptr(X) = [0, 2, 3, 5, 5, 8, 9]. + indptrBuffer: Buffer; + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X are: + /// + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. 
+ indicesBuffer: Buffer; +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSR +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type; + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim]; + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex; + + /// The location and size of the tensor's data + data: Buffer; +} + +root_type SparseTensor; diff --git a/format/Tensor.fbs b/format/Tensor.fbs index e77b353a0f33f..01a20c3b1f1fd 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -51,96 +51,3 @@ table Tensor { } root_type Tensor; - -/// ---------------------------------------------------------------------- -/// EXPERIMENTAL: Data structures for sparse tensors - -/// Coodinate format of sparse tensor index. -table SparseTensorIndexCOO { - /// COO's index list are represented as a NxM matrix, - /// where N is the number of non-zero values, - /// and M is the number of dimensions of a sparse tensor. - /// indicesBuffer stores the location and size of this index matrix. - /// The type of index value is long, so the stride for the index matrix is unnecessary. - /// - /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: - /// - /// X[0, 1, 2, 0] := 1 - /// X[1, 1, 2, 3] := 2 - /// X[0, 2, 1, 0] := 3 - /// X[0, 1, 3, 0] := 4 - /// X[0, 1, 2, 1] := 5 - /// X[1, 2, 0, 4] := 6 - /// - /// In COO format, the index matrix of X is the following 4x6 matrix: - /// - /// [[0, 0, 0, 0, 1, 1], - /// [1, 1, 1, 2, 1, 2], - /// [2, 2, 3, 1, 2, 0], - /// [0, 1, 0, 0, 3, 4]] - /// - /// Note that the indices are sorted in lexcographical order. - indicesBuffer: Buffer; -} - -/// Compressed Sparse Row format, that is matrix-specific. 
-table SparseMatrixIndexCSR { - /// indptrBuffer stores the location and size of indptr array that - /// represents the range of the rows. - /// The i-th row spans from indptr[i] to indptr[i+1] in the data. - /// The length of this array is 1 + (the number of rows), and the type - /// of index value is long. - /// - /// For example, let X be the following 6x4 matrix: - /// - /// X := [[0, 1, 2, 0], - /// [0, 0, 3, 0], - /// [0, 4, 0, 5], - /// [0, 0, 0, 0], - /// [6, 0, 7, 8], - /// [0, 9, 0, 0]]. - /// - /// The array of non-zero values in X is: - /// - /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. - /// - /// And the indptr of X is: - /// - /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. - indptrBuffer: Buffer; - - /// indicesBuffer stores the location and size of the array that - /// contains the column indices of the corresponding non-zero values. - /// The type of index value is long. - /// - /// For example, the indices of the above X is: - /// - /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. - indicesBuffer: Buffer; -} - -union SparseTensorIndex { - SparseTensorIndexCOO, - SparseMatrixIndexCSR -} - -table SparseTensor { - /// The type of data contained in a value cell. - /// Currently only fixed-width value types are supported, - /// no strings or nested types. - type: Type; - - /// The dimensions of the tensor, optionally named. - shape: [TensorDim]; - - /// The number of non-zero values in a sparse tensor. - non_zero_length: long; - - /// Sparse tensor index - sparseIndex: SparseTensorIndex; - - /// The location and size of the tensor's data - data: Buffer; -} - -root_type SparseTensor; diff --git a/java/format/pom.xml b/java/format/pom.xml index 2c3dc03acab1d..5525cd3e43e2b 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -106,6 +106,7 @@ ${flatc.generated.files} ../../format/Schema.fbs ../../format/Tensor.fbs + ../../format/SparseTensor.fbs ../../format/File.fbs ../../format/Message.fbs