diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 2ea007c485c2e..e2a8c5499dc95 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -986,6 +986,11 @@ a ``RecordBatch`` it should be defined in a ``DictionaryBatch``. :: +.. note:: An edge case for interleaved dictionary and record batches occurs + when the record batches contain dictionary-encoded arrays that are + completely null. In this case, the dictionary for the encoded column might + appear after the first record batch. + When a stream reader implementation is reading a stream, after each message, it may read the next 8 bytes to determine both if the stream continues and the size of the message metadata that follows. Once the @@ -1019,7 +1024,10 @@ Schematically we have: :: In the file format, there is no requirement that dictionary keys should be defined in a ``DictionaryBatch`` before they are used in a ``RecordBatch``, as long as the keys are defined somewhere in the -file. +file. Furthermore, it is invalid to have more than one **non-delta** +dictionary batch per dictionary ID (i.e. dictionary replacement is not +supported). Delta dictionaries are applied in the order they appear in +the file footer. Dictionary Messages ------------------- @@ -1073,6 +1081,37 @@ form: :: 0 EOS +Alternatively, if ``isDelta`` is set to false, then the dictionary +replaces the existing dictionary for the same ID. 
Using the same +example as above, an alternate encoding could be: :: + + + <SCHEMA> + <DICTIONARY 0> + (0) "A" + (1) "B" + (2) "C" + + <RECORD BATCH 0> + 0 + 1 + 2 + 1 + + <DICTIONARY 0> + (0) "A" + (1) "C" + (2) "D" + (3) "E" + + <RECORD BATCH 1> + 2 + 1 + 3 + 0 + EOS + + Custom Application Metadata --------------------------- diff --git a/format/Message.fbs b/format/Message.fbs index 2b3a12064b35a..7e8e6779aa5f4 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -74,7 +74,8 @@ table DictionaryBatch { data: RecordBatch; /// If isDelta is true the values in the dictionary are to be appended to a - /// dictionary with the indicated id + /// dictionary with the indicated id. If isDelta is false, this dictionary + /// should replace the existing dictionary. isDelta: bool = false; } diff --git a/format/Schema.fbs b/format/Schema.fbs index 1e39e8f32ac1d..63a80b0676790 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -267,7 +267,10 @@ table KeyValue { /// ---------------------------------------------------------------------- /// Dictionary encoding metadata - +/// Maintained for forwards compatibility. In the future, +/// dictionaries might be explicit maps between integers and values, +/// allowing for non-contiguous index values. +enum DictionaryKind : short { DenseArray } table DictionaryEncoding { /// The known dictionary id in the application where this data is used. In /// the file or streaming formats, the dictionary ids are found in the @@ -283,6 +286,8 @@ table DictionaryEncoding { /// is used to represent ordered categorical data, and we provide a way to /// preserve that metadata here isOrdered: bool; + + dictionaryKind: DictionaryKind; } /// ----------------------------------------------------------------------