From 350ea26c6cc646baefb72e39e138d10f9261f71e Mon Sep 17 00:00:00 2001
From: Austin Liu
Date: Wed, 6 Nov 2024 00:13:19 +0800
Subject: [PATCH] Support `Utf8View` for `bit_length` kernel (#6671)

* Support `Utf8View` for string function `bit_length()`

Signed-off-by: Austin Liu

* Add test & handle view bytes length counting

Signed-off-by: Austin Liu

Add test & handle view bytes length counting

Signed-off-by: Austin Liu

* Refine `string_view_array`

Signed-off-by: Austin Liu

* Make length from `i32` to `u32` & check nullity

Signed-off-by: Austin Liu

* Clean up

Signed-off-by: Austin Liu

* Refine

Signed-off-by: Austin Liu

* Use `from_unary` instead

Signed-off-by: Austin Liu

* Prevent inspect the string data

Signed-off-by: Austin Liu

* Clean up

Signed-off-by: Austin Liu

---------

Signed-off-by: Austin Liu
---
 arrow-string/src/length.rs | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs
index 97f876a9f953..6a28d44ea7aa 100644
--- a/arrow-string/src/length.rs
+++ b/arrow-string/src/length.rs
@@ -137,6 +137,15 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
             let list = array.as_string::<i64>();
             Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
         }
+        DataType::Utf8View => {
+            let list = array.as_string_view();
+            let values = list
+                .views()
+                .iter()
+                .map(|view| (*view as i32).wrapping_mul(8))
+                .collect();
+            Ok(Arc::new(Int32Array::new(values, array.nulls().cloned())))
+        }
         DataType::Binary => {
             let list = array.as_binary::<i32>();
             Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
@@ -462,6 +471,35 @@ mod tests {
             })
     }
 
+    #[test]
+    fn bit_length_test_utf8view() {
+        bit_length_cases()
+            .into_iter()
+            .for_each(|(input, len, expected)| {
+                let string_array = StringViewArray::from(input);
+                let result = bit_length(&string_array).unwrap();
+                assert_eq!(len, result.len());
+                let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+                expected.iter().enumerate().for_each(|(i, value)| {
+                    assert_eq!(*value, result.value(i));
+                });
+            })
+    }
+
+    #[test]
+    fn bit_length_null_utf8view() {
+        bit_length_null_cases()
+            .into_iter()
+            .for_each(|(input, len, expected)| {
+                let array = StringViewArray::from(input); // view array, so the Utf8View arm is the one under test
+                let result = bit_length(&array).unwrap();
+                assert_eq!(len, result.len());
+                let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+
+                let expected: Int32Array = expected.into();
+                assert_eq!(&expected, result);
+            })
+    }
     #[test]
     fn bit_length_binary() {
         let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];