Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/main' into parquet-uuid-schema
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Feb 2, 2025
2 parents 75f56a4 + 43617b2 commit 4c62785
Show file tree
Hide file tree
Showing 50 changed files with 4,317 additions and 598 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Creates a GitHub release on https://github.com/apache/arrow-rs/releases
# when a tag is pushed to the repository (tags matching `*-rc*` are excluded)
name: Release
on:
  push:
    tags:
      # Run for every pushed tag ...
      - '*'
      # ... except release candidates.
      - '!*-rc*'
permissions:
  # `gh release create` needs write access to repository contents.
  contents: write
env:
  # Token picked up by the `gh` CLI for authentication.
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
  publish:
    name: Publish
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - name: Create GitHub Releases
        run: |
          # Derive the release title and changelog from the tag name:
          # `object_store_<version>` tags release the object_store crate,
          # every other tag releases the arrow crates.
          case "${GITHUB_REF_NAME}" in
            object_store_*)
              version=${GITHUB_REF_NAME#object_store_}
              title="object_store ${version}"
              notes_file=object_store/CHANGELOG.md
              ;;
            *)
              version=${GITHUB_REF_NAME}
              title="arrow ${version}"
              notes_file=CHANGELOG.md
              ;;
          esac
          # Quote every expansion so the tag name and path are always passed
          # as single arguments (no word splitting or globbing).
          gh release create "${GITHUB_REF_NAME}" \
            --title "${title}" \
            --notes-file "${notes_file}" \
            --verify-tag
111 changes: 111 additions & 0 deletions CHANGELOG-old.md

Large diffs are not rendered by default.

173 changes: 74 additions & 99 deletions CHANGELOG.md

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ exclude = [
]

[workspace.package]
version = "54.0.0"
version = "54.1.0"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <[email protected]>"]
Expand All @@ -77,20 +77,20 @@ edition = "2021"
rust-version = "1.70"

[workspace.dependencies]
arrow = { version = "54.0.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.0.0", path = "./arrow-arith" }
arrow-array = { version = "54.0.0", path = "./arrow-array" }
arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.0.0", path = "./arrow-cast" }
arrow-csv = { version = "54.0.0", path = "./arrow-csv" }
arrow-data = { version = "54.0.0", path = "./arrow-data" }
arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" }
arrow-json = { version = "54.0.0", path = "./arrow-json" }
arrow-ord = { version = "54.0.0", path = "./arrow-ord" }
arrow-row = { version = "54.0.0", path = "./arrow-row" }
arrow-schema = { version = "54.0.0", path = "./arrow-schema" }
arrow-select = { version = "54.0.0", path = "./arrow-select" }
arrow-string = { version = "54.0.0", path = "./arrow-string" }
parquet = { version = "54.0.0", path = "./parquet", default-features = false }
arrow = { version = "54.1.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.1.0", path = "./arrow-arith" }
arrow-array = { version = "54.1.0", path = "./arrow-array" }
arrow-buffer = { version = "54.1.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.1.0", path = "./arrow-cast" }
arrow-csv = { version = "54.1.0", path = "./arrow-csv" }
arrow-data = { version = "54.1.0", path = "./arrow-data" }
arrow-ipc = { version = "54.1.0", path = "./arrow-ipc" }
arrow-json = { version = "54.1.0", path = "./arrow-json" }
arrow-ord = { version = "54.1.0", path = "./arrow-ord" }
arrow-row = { version = "54.1.0", path = "./arrow-row" }
arrow-schema = { version = "54.1.0", path = "./arrow-schema" }
arrow-select = { version = "54.1.0", path = "./arrow-select" }
arrow-string = { version = "54.1.0", path = "./arrow-string" }
parquet = { version = "54.1.0", path = "./parquet", default-features = false }

chrono = { version = "0.4.34", default-features = false, features = ["clock"] }
22 changes: 9 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ as the [`parquet`] and [`parquet-derive`] crates.

This crate releases every month. We release new major versions (with potentially
breaking API changes) at most once a quarter, and release incremental minor
versions in the intervening months. See [this ticket] for more details.
versions in the intervening months. See [ticket #5368] for more details.

To keep our maintenance burden down, we do regularly scheduled releases (major
and minor) from the `main` branch. How we handle PRs with breaking API changes
Expand All @@ -63,16 +63,13 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[ticket #5368]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/

### `object_store` crate
Expand All @@ -87,7 +84,6 @@ Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Dec 2024 | `0.11.2` | Minor, NO breaking API changes |
| Feb 2025 | `0.12.0` | Major, potentially breaking API changes |

### Guidelines for `panic` vs `Result`
Expand All @@ -96,9 +92,9 @@ In general, use panics for bad states that are unreachable, unrecoverable or har
For those caused by invalid user input, however, we prefer to report that invalidity
gracefully as an error result instead of panicking. In general, invalid input should result
in an `Error` as soon as possible. It _is_ ok for code paths after validation to assume
validation has already occurred and panic if not. See [this ticket] for more nuances.
validation has already occurred and panic if not. See [ticket #6737] for more nuances.

[this ticket]: https://github.com/apache/arrow-rs/issues/6737
[ticket #6737]: https://github.com/apache/arrow-rs/issues/6737

### Deprecation Guidelines

Expand Down
7 changes: 6 additions & 1 deletion arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ impl OffsetSizeTrait for i64 {
}

/// An array of [variable length lists], similar to JSON arrays
/// (e.g. `["A", "B", "C"]`).
/// (e.g. `["A", "B", "C"]`). This struct specifically represents
/// the [list layout]. Refer to [`GenericListViewArray`] for the
/// [list-view layout].
///
/// Lists are represented using `offsets` into a `values` child
/// array. Offsets are stored in two adjacent entries of an
Expand Down Expand Up @@ -123,7 +125,10 @@ impl OffsetSizeTrait for i64 {
/// ```
///
/// [`StringArray`]: crate::array::StringArray
/// [`GenericListViewArray`]: crate::array::GenericListViewArray
/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout
/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data_type: DataType,
nulls: Option<NullBuffer>,
Expand Down
71 changes: 68 additions & 3 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,81 @@ pub type ListViewArray = GenericListViewArray<i32>;
/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`.
pub type LargeListViewArray = GenericListViewArray<i64>;

/// An array of [variable length lists], specifically in the [list-view layout].
///
/// Different from [`crate::GenericListArray`] as it stores both an offset and length
/// meaning that take / filter operations can be implemented without copying the underlying data.
/// Differs from [`GenericListArray`] (which represents the [list layout]) in that
/// the sizes of the child arrays are explicitly encoded in a separate buffer, instead
/// of being derived from the difference between subsequent offsets in the offset buffer.
///
/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout
/// This allows the offsets (and subsequently child data) to be out of order. It also
/// allows take / filter operations to be implemented without copying the underlying data.
///
/// # Representation
///
/// Given the same example array from [`GenericListArray`], it would be represented
/// as such via a list-view layout array:
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ┌ ─ ─ ─ ─ ─ ─ ┐ │
/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐
/// │ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ 3 │ │ │ 1 │ │ A │ │ 0 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [] │ │ (3,0) │ │ 1 │ │ 3 │ │ 0 │ │ │ 1 │ │ B │ │ 1 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ NULL │ │ (?,?) │ │ 0 │ │ ? │ │ ? │ │ │ 1 │ │ C │ │ 2 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [D] │ │ (4,1) │ │ 1 │ │ 4 │ │ 1 │ │ │ ? │ │ ? │ │ 3 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [NULL, F] │ │ (5,2) │ │ 1 │ │ 5 │ │ 2 │ │ │ 1 │ │ D │ │ 4 │
/// └─────────────┘ └───────┘ │ └───┘ └───┘ └───┘ ├───┤ ├───┤
/// │ │ 0 │ │ ? │ │ 5 │
/// Logical Logical │ Validity Offsets Sizes ├───┤ ├───┤
/// Values Offset (nulls) │ │ 1 │ │ F │ │ 6 │
/// & Size │ └───┘ └───┘
/// │ Values │ │
/// (offsets[i], │ ListViewArray (Array)
/// sizes[i]) └ ─ ─ ─ ─ ─ ─ ┘ │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// Another way of representing the same array but taking advantage of the offsets being out of order:
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ┌ ─ ─ ─ ─ ─ ─ ┐ │
/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐
/// │ [A,B,C] │ │ (2,3) │ │ 1 │ │ 2 │ │ 3 │ │ │ 0 │ │ ? │ │ 0 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [] │ │ (0,0) │ │ 1 │ │ 0 │ │ 0 │ │ │ 1 │ │ F │ │ 1 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ NULL │ │ (?,?) │ │ 0 │ │ ? │ │ ? │ │ │ 1 │ │ A │ │ 2 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [D] │ │ (5,1) │ │ 1 │ │ 5 │ │ 1 │ │ │ 1 │ │ B │ │ 3 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤
/// │ [NULL, F] │ │ (0,2) │ │ 1 │ │ 0 │ │ 2 │ │ │ 1 │ │ C │ │ 4 │
/// └─────────────┘ └───────┘ │ └───┘ └───┘ └───┘ ├───┤ ├───┤
/// │ │ 1 │ │ D │ │ 5 │
/// Logical Logical │ Validity Offsets Sizes └───┘ └───┘
/// Values Offset (nulls) │ Values │ │
/// & Size │ (Array)
/// └ ─ ─ ─ ─ ─ ─ ┘ │
/// (offsets[i], │ ListViewArray
/// sizes[i]) │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// [`GenericListArray`]: crate::array::GenericListArray
/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout
/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout
#[derive(Clone)]
pub struct GenericListViewArray<OffsetSize: OffsetSizeTrait> {
// The `DataType` of this array.
data_type: DataType,
// Optional validity (null) buffer; `None` when no null buffer is stored.
nulls: Option<NullBuffer>,
// Child array holding the list elements that offsets and sizes refer to.
values: ArrayRef,
// Unlike GenericListArray, we do not use OffsetBuffer here as offsets are not
// guaranteed to be monotonically increasing.
value_offsets: ScalarBuffer<OffsetSize>,
// Sizes of the lists, stored in their own buffer rather than being derived
// from the difference between subsequent offsets.
value_sizes: ScalarBuffer<OffsetSize>,
}
Expand Down
23 changes: 23 additions & 0 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,29 @@ impl<'a> StringArrayType<'a> for &'a StringViewArray {
}
}

/// A trait for Arrow Binary Arrays, currently three types are supported:
/// - `BinaryArray`
/// - `LargeBinaryArray`
/// - `BinaryViewArray`
///
/// This trait helps to abstract over the different types of binary arrays
/// so that we don't need to duplicate the implementation for each type.
///
/// Implementors are [`ArrayAccessor`]s whose items are `&'a [u8]` byte slices.
pub trait BinaryArrayType<'a>: ArrayAccessor<Item = &'a [u8]> + Sized {
/// Constructs a new [`ArrayIter`] over this array
fn iter(&self) -> ArrayIter<Self>;
}

// `BinaryArrayType` for references to `GenericBinaryArray`, generic over the
// offset type `O`.
impl<'a, O: OffsetSizeTrait> BinaryArrayType<'a> for &'a GenericBinaryArray<O> {
// Delegates to `GenericBinaryArray::<O>::iter`, named by its full path.
// NOTE(review): presumably spelled out so the call cannot resolve back to
// this trait method — confirm before simplifying to `self.iter()`.
fn iter(&self) -> ArrayIter<Self> {
GenericBinaryArray::<O>::iter(self)
}
}
// `BinaryArrayType` for references to `BinaryViewArray`.
impl<'a> BinaryArrayType<'a> for &'a BinaryViewArray {
// Delegates to `BinaryViewArray::iter`, named by its full path.
// NOTE(review): presumably spelled out so the call cannot resolve back to
// this trait method — confirm before simplifying to `self.iter()`.
fn iter(&self) -> ArrayIter<Self> {
BinaryViewArray::iter(self)
}
}

impl PartialEq for dyn Array + '_ {
fn eq(&self, other: &Self) -> bool {
self.to_data().eq(&other.to_data())
Expand Down
16 changes: 16 additions & 0 deletions arrow-array/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,24 @@
//! RecordBatch::from(&builder.finish())
//! }
//! ```
//!
//! # Null / Validity Masks
//!
//! The [`NullBufferBuilder`] is optimized for creating the null mask for an array.
//!
//! ```
//! # use arrow_array::builder::NullBufferBuilder;
//! let mut builder = NullBufferBuilder::new(8);
//! builder.append_n_non_nulls(7);
//! builder.append_null();
//! let buffer = builder.finish().unwrap();
//! assert_eq!(buffer.len(), 8);
//! assert_eq!(buffer.iter().collect::<Vec<_>>(), vec![true, true, true, true, true, true, true, false]);
//! ```
pub use arrow_buffer::BooleanBufferBuilder;
pub use arrow_buffer::NullBufferBuilder;

mod boolean_builder;
pub use boolean_builder::*;
Expand Down
18 changes: 17 additions & 1 deletion arrow-array/src/builder/primitive_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@ where
keys_builder.is_empty() && values_builder.is_empty(),
"keys and values builders must be empty"
);
let values_capacity = values_builder.capacity();
Self {
keys_builder,
values_builder,
map: HashMap::new(),
map: HashMap::with_capacity(values_capacity),
}
}

Expand Down Expand Up @@ -633,4 +634,19 @@ mod tests {

assert_eq!(values, [None, None]);
}

#[test]
fn creating_dictionary_from_builders_should_use_values_capacity_for_the_map() {
    // Build two empty builders with known capacities: 1 key slot, 2 value slots.
    let keys = PrimitiveBuilder::with_capacity(1).with_data_type(DataType::Int32);
    let values = PrimitiveBuilder::with_capacity(2).with_data_type(DataType::Timestamp(
        arrow_schema::TimeUnit::Microsecond,
        Some("+08:00".into()),
    ));

    let builder = PrimitiveDictionaryBuilder::<
        Int32Type,
        crate::types::TimestampMicrosecondType,
    >::new_from_empty_builders(keys, values);

    // The dedup map must be pre-sized to hold at least as many entries as the
    // values builder was created for.
    let map_capacity = builder.map.capacity();
    let values_capacity = builder.values_builder.capacity();
    assert!(
        map_capacity >= values_capacity,
        "map capacity {} should be at least the values capacity {}",
        map_capacity,
        values_capacity
    );
}
}
Loading

0 comments on commit 4c62785

Please sign in to comment.