Skip to content

Commit

Permalink
Add append_many to dictionary arrays to allow adding repeated values (a…
Browse files Browse the repository at this point in the history
…pache#6534)

* Add append_many to dictionary arrays to allow adding repeated values

* fix merge

* rename func
  • Loading branch information
adriangb authored Oct 10, 2024
1 parent 44b6ded commit 4adbeee
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 20 deletions.
68 changes: 68 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,74 @@ mod tests {
assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
}

#[test]
fn test_dictionary_builder_append_many() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();

builder.append(1).unwrap();
builder.append_n(2, 2).unwrap();
builder.append_options(None, 2);
builder.append_options(Some(3), 3);

let array = builder.finish();

let values = array
.values()
.as_primitive::<UInt32Type>()
.iter()
.map(Option::unwrap)
.collect::<Vec<_>>();
assert_eq!(values, &[1, 2, 3]);
let keys = array.keys().iter().collect::<Vec<_>>();
assert_eq!(
keys,
&[
Some(0),
Some(1),
Some(1),
None,
None,
Some(2),
Some(2),
Some(2)
]
);
}

#[test]
fn test_string_dictionary_builder_append_many() {
let mut builder = StringDictionaryBuilder::<Int8Type>::new();

builder.append("a").unwrap();
builder.append_n("b", 2).unwrap();
builder.append_options(None::<&str>, 2);
builder.append_options(Some("c"), 3);

let array = builder.finish();

let values = array
.values()
.as_string::<i32>()
.iter()
.map(Option::unwrap)
.collect::<Vec<_>>();
assert_eq!(values, &["a", "b", "c"]);
let keys = array.keys().iter().collect::<Vec<_>>();
assert_eq!(
keys,
&[
Some(0),
Some(1),
Some(1),
None,
None,
Some(2),
Some(2),
Some(2)
]
);
}

#[test]
fn test_dictionary_array_fmt_debug() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
Expand Down
58 changes: 50 additions & 8 deletions arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,7 @@ where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
/// Append a value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
let value_native: &T::Native = value.as_ref();
let value_bytes: &[u8] = value_native.as_ref();

Expand All @@ -223,8 +218,32 @@ where
.get();

let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;

Ok(key)
}

/// Append a value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
Ok(key)
}

/// Append a value multiple times to the array.
/// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
///
/// Returns an error if the new index would overflow the key type.
pub fn append_n(
&mut self,
value: impl AsRef<T::Native>,
count: usize,
) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value_n(key, count);
Ok(key)
}

Expand All @@ -237,6 +256,17 @@ where
self.append(value).expect("dictionary key overflow");
}

/// Infallibly append a value to this builder repeatedly `count` times.
/// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
self.append_n(value, count)
.expect("dictionary key overflow");
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand All @@ -256,6 +286,19 @@ where
};
}

/// Append an `Option` value into the builder repeatedly `count` times.
/// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
match value {
None => self.keys_builder.append_nulls(count),
Some(v) => self.append_values(v, count),
};
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
Expand Down Expand Up @@ -331,8 +374,7 @@ fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append("def").unwrap();
/// builder.append("def").unwrap();
/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single lookup
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/builder/primitive_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,13 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
self.values_builder.append(v);
}

/// Appends a value of type `T` into the builder `n` times
#[inline]
pub fn append_value_n(&mut self, v: T::Native, n: usize) {
self.null_buffer_builder.append_n_non_nulls(n);
self.values_builder.append_n(n, v);
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand Down
62 changes: 50 additions & 12 deletions arrow-array/src/builder/primitive_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;

Expand Down Expand Up @@ -210,26 +209,41 @@ where
K: ArrowDictionaryKeyType,
V: ArrowPrimitiveType,
{
/// Append a primitive value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
#[inline]
pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
let key = match self.map.entry(Value(value)) {
Entry::Vacant(vacant) => {
// Append new value.
fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
match self.map.get(&Value(value)) {
Some(&key) => {
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
}
None => {
let key = self.values_builder.len();
self.values_builder.append_value(value);
vacant.insert(key);
K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?
self.map.insert(Value(value), key);
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
}
Entry::Occupied(o) => K::Native::usize_as(*o.get()),
};
}
}

/// Append a primitive value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
#[inline]
pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
Ok(key)
}

/// Append a value multiple times to the array.
/// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
///
/// Returns an error if the new index would overflow the key type.
pub fn append_n(&mut self, value: V::Native, count: usize) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value_n(key, count);
Ok(key)
}

/// Infallibly append a value to this builder
///
/// # Panics
Expand All @@ -240,6 +254,17 @@ where
self.append(value).expect("dictionary key overflow");
}

/// Infallibly append a value to this builder repeatedly `count` times.
/// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_values(&mut self, value: V::Native, count: usize) {
self.append_n(value, count)
.expect("dictionary key overflow");
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand All @@ -259,6 +284,19 @@ where
};
}

/// Append an `Option` value into the builder repeatedly `count` times.
/// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
match value {
None => self.keys_builder.append_nulls(count),
Some(v) => self.append_values(v, count),
};
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.map.clear();
Expand Down

0 comments on commit 4adbeee

Please sign in to comment.