From 0ddc1f4737c35008cd06be1ee28472ebd7da68e2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 1 Dec 2019 20:23:18 -0500 Subject: [PATCH] ARROW-6473: Dictionary encoding format clarifications/future proofing This needs to be discussed first on the mailing list. It is a consolidation of recent dictionary encoding threads: [1](https://lists.apache.org/thread.html/9734b71bc12aca16eb997388e95105bff412fdaefa4e19422f477389@%3Cdev.arrow.apache.org%3E), [2](https://lists.apache.org/thread.html/5c3c9346101df8d758e24664638e8ada0211d310ab756a89cde3786a@%3Cdev.arrow.apache.org%3E) and [3](https://lists.apache.org/thread.html/15a4810589b2eb772bce5b2372970d9d93badbd28999a1bbe2af418a@%3Cdev.arrow.apache.org%3E) Closes #5585 from emkornfield/dict_document and squashes the following commits: ee8cbfd7a update to latest submodule d1a080415 be explicit about dictionary replacement 52137823e remove duplicate the 65c709d62 Revert "remove duplicate the" 7c1f1717f address feedback e58e5df30 revert testing 720a05e8a Update based on review. 3d65c7549 remove duplicate the 509f2d0bc undo related change. 2f0724ce3 Proposal Lead-authored-by: Micah Kornfield Co-authored-by: emkornfield Signed-off-by: Micah Kornfield --- docs/source/format/Columnar.rst | 41 ++++++++++++++++++++++++++++++++- format/Message.fbs | 3 ++- format/Schema.fbs | 7 +++++- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 2ea007c485c2e..e2a8c5499dc95 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -986,6 +986,11 @@ a ``RecordBatch`` it should be defined in a ``DictionaryBatch``. :: +.. note:: An edge-case for interleaved dictionary and record batches occurs + when the record batches contain dictionary encoded arrays that are + completely null. In this case, the dictionary for the encoded column might + appear after the first record batch. 
+ When a stream reader implementation is reading a stream, after each message, it may read the next 8 bytes to determine both if the stream continues and the size of the message metadata that follows. Once the @@ -1019,7 +1024,10 @@ Schematically we have: :: In the file format, there is no requirement that dictionary keys should be defined in a ``DictionaryBatch`` before they are used in a ``RecordBatch``, as long as the keys are defined somewhere in the -file. +file. Furthermore, it is invalid to have more than one **non-delta** +dictionary batch per dictionary ID (i.e. dictionary replacement is not +supported). Delta dictionaries are applied in the order they appear in +the file footer. Dictionary Messages ------------------- @@ -1073,6 +1081,37 @@ form: :: 0 EOS +Alternatively, if ``isDelta`` is set to false, then the dictionary +replaces the existing dictionary for the same ID. Using the same +example as above, an alternate encoding could be: :: + + + + + (0) "A" + (1) "B" + (2) "C" + + + 0 + 1 + 2 + 1 + + + (0) "A" + (1) "C" + (2) "D" + (3) "E" + + + 2 + 1 + 3 + 0 + EOS + + Custom Application Metadata --------------------------- diff --git a/format/Message.fbs b/format/Message.fbs index 2b3a12064b35a..7e8e6779aa5f4 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -74,7 +74,8 @@ table DictionaryBatch { data: RecordBatch; /// If isDelta is true the values in the dictionary are to be appended to a - /// dictionary with the indicated id + /// dictionary with the indicated id. If isDelta is false this dictionary + /// should replace the existing dictionary. 
 isDelta: bool = false; } diff --git a/format/Schema.fbs b/format/Schema.fbs index 1e39e8f32ac1d..63a80b0676790 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -267,7 +267,10 @@ table KeyValue { /// ---------------------------------------------------------------------- /// Dictionary encoding metadata - +/// Maintained for forwards compatibility. In the future, +/// dictionaries might be explicit maps between integers and values, +/// allowing for non-contiguous index values. +enum DictionaryKind : short { DenseArray } table DictionaryEncoding { /// The known dictionary id in the application where this data is used. In /// the file or streaming formats, the dictionary ids are found in the @@ -283,6 +286,8 @@ table DictionaryEncoding { /// is used to represent ordered categorical data, and we provide a way to /// preserve that metadata here isOrdered: bool; + + dictionaryKind: DictionaryKind; } /// ----------------------------------------------------------------------