forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-6078: [Java] Implement dictionary-encoded subfields for List type
Related to [ARROW-6078](https://issues.apache.org/jira/browse/ARROW-6078). For example, an int type List (valueCount = 5) that has data like below: [10, 20], [10, 20], [30, 40, 50], [30, 40, 50], [10, 20] could be encoded to: [0, 1], [0, 1], [2, 3, 4], [2, 3, 4], [0, 1] with list type dictionary 10, 20, 30, 40, 50 or 10, 20, 30, 40, 50 Closes apache#4972 from tianchen92/ARROW-1175 and squashes the following commits: 5d2f751 <tianchen92> Update java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java fbd122b <tianchen> fix c51ec00 <tianchen> add replaceDataVector in BaseListVector 658958b <tianchen> make BaseListVector extend FieldVector 6c9d95d <tianchen> refactor BaseListVector 0b6cec5 <tianchen> resolve conflict a54ecd1 <tianchen> ARROW-6078: Implement dictionary-encoded subfields for List type Lead-authored-by: tianchen <[email protected]> Co-authored-by: tianchen92 <[email protected]> Signed-off-by: Micah Kornfield <[email protected]>
- Loading branch information
1 parent
beea8f9
commit 157b179
Showing
7 changed files
with
387 additions
and
29 deletions.
There are no files selected for viewing
36 changes: 36 additions & 0 deletions
36
java/vector/src/main/java/org/apache/arrow/vector/complex/BaseListVector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.arrow.vector.complex; | ||
|
||
import org.apache.arrow.vector.FieldVector; | ||
|
||
/** | ||
* Abstraction for all list type vectors. | ||
*/ | ||
public interface BaseListVector extends FieldVector { | ||
|
||
/** | ||
* Get data vector start index with the given list index. | ||
*/ | ||
int getElementStartIndex(int index); | ||
|
||
/** | ||
* Get data vector end index with the given list index. | ||
*/ | ||
int getElementEndIndex(int index); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
131 changes: 131 additions & 0 deletions
131
java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.arrow.vector.dictionary; | ||
|
||
import java.util.Collections; | ||
|
||
import org.apache.arrow.memory.BufferAllocator; | ||
import org.apache.arrow.vector.BaseIntVector; | ||
import org.apache.arrow.vector.FieldVector; | ||
import org.apache.arrow.vector.ValueVector; | ||
import org.apache.arrow.vector.complex.BaseListVector; | ||
import org.apache.arrow.vector.ipc.message.ArrowFieldNode; | ||
import org.apache.arrow.vector.types.pojo.Field; | ||
import org.apache.arrow.vector.types.pojo.FieldType; | ||
import org.apache.arrow.vector.util.TransferPair; | ||
|
||
/** | ||
* Sub fields encoder/decoder for Dictionary encoded {@link BaseListVector}. | ||
*/ | ||
public class ListSubfieldEncoder { | ||
|
||
private final DictionaryHashTable hashTable; | ||
private final Dictionary dictionary; | ||
private final BufferAllocator allocator; | ||
|
||
/** | ||
* Construct an instance. | ||
*/ | ||
public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator) { | ||
this.dictionary = dictionary; | ||
this.allocator = allocator; | ||
BaseListVector dictVector = (BaseListVector) dictionary.getVector(); | ||
hashTable = new DictionaryHashTable(getDataVector(dictVector)); | ||
} | ||
|
||
private FieldVector getDataVector(BaseListVector vector) { | ||
return vector.getChildrenFromFields().get(0); | ||
} | ||
|
||
private BaseListVector cloneVector(BaseListVector vector) { | ||
|
||
final FieldType fieldType = vector.getField().getFieldType(); | ||
BaseListVector cloned = (BaseListVector) fieldType.createNewSingleVector(vector.getField().getName(), | ||
allocator, /*schemaCallBack=*/null); | ||
|
||
final ArrowFieldNode fieldNode = new ArrowFieldNode(vector.getValueCount(), vector.getNullCount()); | ||
cloned.loadFieldBuffers(fieldNode, vector.getFieldBuffers()); | ||
|
||
return cloned; | ||
} | ||
|
||
/** | ||
* Dictionary encodes subfields for complex vector with a provided dictionary. | ||
* The dictionary must contain all values in the sub fields vector. | ||
* @param vector vector to encode | ||
* @return dictionary encoded vector | ||
*/ | ||
public BaseListVector encodeListSubField(BaseListVector vector) { | ||
final int valueCount = vector.getValueCount(); | ||
|
||
FieldType indexFieldType = new FieldType(vector.getField().isNullable(), | ||
dictionary.getEncoding().getIndexType(), dictionary.getEncoding(), vector.getField().getMetadata()); | ||
Field valueField = new Field(vector.getField().getName(), indexFieldType,null); | ||
|
||
// clone list vector and initialize data vector | ||
BaseListVector encoded = cloneVector(vector); | ||
encoded.initializeChildrenFromFields(Collections.singletonList(valueField)); | ||
BaseIntVector indices = (BaseIntVector) getDataVector(encoded); | ||
|
||
ValueVector dataVector = getDataVector(vector); | ||
for (int i = 0; i < valueCount; i++) { | ||
if (!vector.isNull(i)) { | ||
int start = vector.getElementStartIndex(i); | ||
int end = vector.getElementEndIndex(i); | ||
|
||
DictionaryEncoder.buildIndexVector(dataVector, indices, hashTable, start, end); | ||
} | ||
} | ||
|
||
return encoded; | ||
} | ||
|
||
/** | ||
* Decodes a dictionary subfields encoded vector using the provided dictionary. | ||
* @param vector dictionary encoded vector, its data vector must be int type | ||
* @return vector with values restored from dictionary | ||
*/ | ||
public BaseListVector decodeListSubField(BaseListVector vector) { | ||
|
||
int valueCount = vector.getValueCount(); | ||
BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector(); | ||
int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount(); | ||
|
||
// clone list vector and initialize data vector | ||
BaseListVector decoded = cloneVector(vector); | ||
Field dataVectorField = getDataVector(dictionaryVector).getField(); | ||
decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField)); | ||
|
||
// get data vector | ||
ValueVector dataVector = getDataVector(decoded); | ||
|
||
TransferPair transfer = getDataVector(dictionaryVector).makeTransferPair(dataVector); | ||
BaseIntVector indices = (BaseIntVector) getDataVector(vector); | ||
|
||
for (int i = 0; i < valueCount; i++) { | ||
|
||
if (!vector.isNull(i)) { | ||
int start = vector.getElementStartIndex(i); | ||
int end = vector.getElementEndIndex(i); | ||
|
||
DictionaryEncoder.retrieveIndexVector(indices, transfer, dictionaryValueCount, start, end); | ||
} | ||
} | ||
return decoded; | ||
} | ||
} |
Oops, something went wrong.