diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index f351c25bd3ab..aa66998af8d4 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -937,7 +937,7 @@ mod tests { }; use arrow_array::*; use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime}; - use arrow_data::ArrayDataBuilder; + use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ ArrowError, DataType as ArrowDataType, Field, Fields, Schema, SchemaRef, TimeUnit, }; @@ -4067,6 +4067,56 @@ mod tests { } } + #[test] + fn test_read_old_nested_list() { + use arrow::datatypes::DataType; + use arrow::datatypes::ToByteSlice; + + let testdata = arrow::util::test_util::parquet_test_data(); + // message my_record { + // REQUIRED group a (LIST) { + // REPEATED group array (LIST) { + // REPEATED INT32 array; + // } + // } + // } + // should be read as list> + let path = format!("{testdata}/old_list_structure.parquet"); + let test_file = File::open(path).unwrap(); + + // create expected ListArray + let a_values = Int32Array::from(vec![1, 2, 3, 4]); + + // Construct a buffer for value offsets, for the nested array: [[1, 2], [3, 4]] + let a_value_offsets = arrow::buffer::Buffer::from([0, 2, 4].to_byte_slice()); + + // Construct a list array from the above two + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( + "array", + DataType::Int32, + false, + )))) + .len(2) + .add_buffer(a_value_offsets) + .add_child_data(a_values.into_data()) + .build() + .unwrap(); + let a = ListArray::from(a_list_data); + + let builder = ParquetRecordBatchReaderBuilder::try_new(test_file).unwrap(); + let mut reader = builder.build().unwrap(); + let out = reader.next().unwrap().unwrap(); + assert_eq!(out.num_rows(), 1); + assert_eq!(out.num_columns(), 1); + // grab first column + let c0 = out.column(0); + let c0arr = c0.as_any().downcast_ref::().unwrap(); + // get first row: [[1, 2], [3, 4]] + let r0 = c0arr.value(0); + let r0arr = r0.as_any().downcast_ref::().unwrap(); + assert_eq!(r0arr, &a); + } + #[test] fn test_map_no_value() { // File schema: diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index dbc81d1ee195..d2f5a0bd8d63 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -453,15 +453,21 @@ impl Visitor { }; } + // test to see if the repeated field is a struct or one-tuple let items = repeated_field.get_fields(); if items.len() != 1 - || repeated_field.name() == "array" - || repeated_field.name() == format!("{}_tuple", list_type.name()) + || (!repeated_field.is_list() + && !repeated_field.has_single_repeated_child() + && (repeated_field.name() == "array" + || repeated_field.name() == format!("{}_tuple", list_type.name()))) { - // If the repeated field is a group with multiple fields, then its type is the element type and elements are required. + // If the repeated field is a group with multiple fields, then its type is the element + // type and elements are required. // - // If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name - // with _tuple appended then the repeated type is the element type and elements are required. + // If the repeated field is a group with one field and is named either array or uses + // the LIST-annotated group's name with _tuple appended then the repeated type is the + // element type and elements are required. But this rule only applies if the + // repeated field is not annotated, and the single child field is not `repeated`. let context = VisitorContext { rep_level: context.rep_level, def_level, diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 5e8dc9714875..9e70f7a980db 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -359,6 +359,19 @@ impl Reader { /// /// #backward-compatibility-rules fn is_element_type(repeated_type: &Type) -> bool { + // For legacy 2-level list types whose element type is a 2-level list + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group array (LIST) { + // repeated int32 array; + // }; + // } + // + if repeated_type.is_list() || repeated_type.has_single_repeated_child() { + return false; + } + // For legacy 2-level list types with primitive element type, e.g.: // // // ARRAY (nullable list, non-null elements) @@ -1882,4 +1895,21 @@ mod tests { let iter = row_group_reader.get_row_iter(schema)?; Ok(iter.map(|row| row.unwrap()).collect()) } + + #[test] + fn test_read_old_nested_list() { + let rows = test_file_reader_rows("old_list_structure.parquet", None).unwrap(); + let expected_rows = vec![row![( + "a".to_string(), + Field::ListInternal(make_list( + [ + make_list([1, 2].map(Field::Int).to_vec()), + make_list([3, 4].map(Field::Int).to_vec()) + ] + .map(Field::ListInternal) + .to_vec() + )) + ),]]; + assert_eq!(rows, expected_rows); + } } diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index b7ba95eb56bb..d062c175dd20 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -202,6 +202,29 @@ impl Type { self.get_basic_info().has_repetition() && self.get_basic_info().repetition() != Repetition::REQUIRED } + + /// Returns `true` if this type is annotated as a list. + pub(crate) fn is_list(&self) -> bool { + if self.is_group() { + let basic_info = self.get_basic_info(); + if let Some(logical_type) = basic_info.logical_type() { + return logical_type == LogicalType::List; + } + return basic_info.converted_type() == ConvertedType::LIST; + } + false + } + + /// Returns `true` if this type is a group with a single child field that is `repeated`. + pub(crate) fn has_single_repeated_child(&self) -> bool { + if self.is_group() { + let children = self.get_fields(); + return children.len() == 1 + && children[0].get_basic_info().has_repetition() + && children[0].get_basic_info().repetition() == Repetition::REPEATED; + } + false + } } /// A builder for primitive types. All attributes are optional