Skip to content

Commit

Permalink
Deprecate max statistics size properly (apache#14188)
Browse files Browse the repository at this point in the history
* deprecating max_statistics_size

* formatting by rustfmt

* Sprinkle more #[allow(deprecated)]

* update expected configs

* fix: add deprecation warning to information_schema

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
logan-keede and alamb authored Jan 25, 2025
1 parent 1dad545 commit 18f14ab
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 27 deletions.
78 changes: 54 additions & 24 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,38 +108,51 @@ use crate::{DataFusionError, Result};
/// ```
///
/// NB: Misplaced commas may result in nonsensical errors
#[macro_export]
macro_rules! config_namespace {
(
$(#[doc = $struct_d:tt])*
$vis:vis struct $struct_name:ident {
$(
$(#[doc = $d:tt])*
$field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr
)*$(,)*
}
$(#[doc = $struct_d:tt])* // Struct-level documentation attributes
$(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
$(#[allow($($struct_de:tt)*)])?
$vis:vis struct $struct_name:ident {
$(
$(#[doc = $d:tt])* // Field-level documentation attributes
$(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
$(#[allow($($field_de:tt)*)])?
$field_vis:vis $field_name:ident : $field_type:ty,
$(warn = $warn:expr,)?
$(transform = $transform:expr,)?
default = $default:expr
)*$(,)*
}
) => {

$(#[doc = $struct_d])*
$(#[doc = $struct_d])* // Apply struct documentation
$(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
$(#[allow($($struct_de)*)])?
#[derive(Debug, Clone, PartialEq)]
$vis struct $struct_name{
$vis struct $struct_name {
$(
$(#[doc = $d])*
$field_vis $field_name : $field_type,
$(#[doc = $d])* // Apply field documentation
$(#[deprecated($($field_depr)*)])? // Apply field deprecation
$(#[allow($($field_de)*)])?
$field_vis $field_name: $field_type,
)*
}

impl ConfigField for $struct_name {
fn set(&mut self, key: &str, value: &str) -> Result<()> {
let (key, rem) = key.split_once('.').unwrap_or((key, ""));

match key {
$(
stringify!($field_name) => {
$(let value = $transform(value);)?
$(log::warn!($warn);)?
self.$field_name.set(rem, value.as_ref())
},
stringify!($field_name) => {
// Safely apply deprecated attribute if present
// $(#[allow(deprecated)])?
{
$(let value = $transform(value);)? // Apply transformation if specified
$(log::warn!($warn);)? // Log warning if specified
#[allow(deprecated)]
self.$field_name.set(rem, value.as_ref())
}
},
)*
_ => return _config_err!(
"Config value \"{}\" not found on {}", key, stringify!($struct_name)
Expand All @@ -149,15 +162,16 @@ macro_rules! config_namespace {

fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
$(
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
let desc = concat!($($d),*).trim();
self.$field_name.visit(v, key.as_str(), desc);
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
let desc = concat!($($d),*).trim();
#[allow(deprecated)]
self.$field_name.visit(v, key.as_str(), desc);
)*
}
}

impl Default for $struct_name {
fn default() -> Self {
#[allow(deprecated)]
Self {
$($field_name: $default),*
}
Expand Down Expand Up @@ -467,6 +481,9 @@ config_namespace! {

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
/// max_statistics_size is deprecated, currently it is not being used
// TODO: remove once deprecated
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
pub max_statistics_size: Option<usize>, default = Some(4096)

/// (writing) Target maximum number of rows in each row group (defaults to 1M
Expand Down Expand Up @@ -1598,19 +1615,23 @@ impl ConfigField for TableParquetOptions {
macro_rules! config_namespace_with_hashmap {
(
$(#[doc = $struct_d:tt])*
$(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
$vis:vis struct $struct_name:ident {
$(
$(#[doc = $d:tt])*
$(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
)*$(,)*
}
) => {

$(#[doc = $struct_d])*
$(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
#[derive(Debug, Clone, PartialEq)]
$vis struct $struct_name{
$(
$(#[doc = $d])*
$(#[deprecated($($field_depr)*)])? // Apply field deprecation
$field_vis $field_name : $field_type,
)*
}
Expand All @@ -1621,6 +1642,8 @@ macro_rules! config_namespace_with_hashmap {
match key {
$(
stringify!($field_name) => {
// Handle deprecated fields
#[allow(deprecated)] // Allow deprecated fields
$(let value = $transform(value);)?
self.$field_name.set(rem, value.as_ref())
},
Expand All @@ -1635,13 +1658,16 @@ macro_rules! config_namespace_with_hashmap {
$(
let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
let desc = concat!($($d),*).trim();
// Handle deprecated fields
#[allow(deprecated)]
self.$field_name.visit(v, key.as_str(), desc);
)*
}
}

impl Default for $struct_name {
fn default() -> Self {
#[allow(deprecated)]
Self {
$($field_name: $default),*
}
Expand All @@ -1653,7 +1679,7 @@ macro_rules! config_namespace_with_hashmap {
let parts: Vec<&str> = key.splitn(2, "::").collect();
match parts.as_slice() {
[inner_key, hashmap_key] => {
// Get or create the ColumnOptions for the specified column
// Get or create the struct for the specified key
let inner_value = self
.entry((*hashmap_key).to_owned())
.or_insert_with($struct_name::default);
Expand All @@ -1669,6 +1695,7 @@ macro_rules! config_namespace_with_hashmap {
$(
let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
let desc = concat!($($d),*).trim();
#[allow(deprecated)]
col_options.$field_name.visit(v, key.as_str(), desc);
)*
}
Expand Down Expand Up @@ -1720,6 +1747,9 @@ config_namespace_with_hashmap! {

/// Sets max statistics size for the column path. If NULL, uses
/// default parquet options
/// max_statistics_size is deprecated, currently it is not being used
// TODO: remove once deprecated
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
pub max_statistics_size: Option<usize>, default = None
}
}
Expand Down
10 changes: 9 additions & 1 deletion datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use crate::{
};

use arrow_schema::Schema;
// TODO: handle once deprecated
#[allow(deprecated)]
use parquet::{
arrow::ARROW_SCHEMA_META_KEY,
Expand Down Expand Up @@ -157,6 +158,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
}

// max_statistics_size is deprecated, currently it is not being used
// TODO: remove once deprecated
#[allow(deprecated)]
if let Some(max_statistics_size) = options.max_statistics_size {
builder = {
#[allow(deprecated)]
Expand Down Expand Up @@ -202,6 +206,7 @@ impl ParquetOptions {
///
/// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
pub fn into_writer_properties_builder(&self) -> Result<WriterPropertiesBuilder> {
#[allow(deprecated)]
let ParquetOptions {
data_pagesize_limit,
write_batch_size,
Expand Down Expand Up @@ -452,6 +457,7 @@ mod tests {
fn column_options_with_non_defaults(
src_col_defaults: &ParquetOptions,
) -> ParquetColumnOptions {
#[allow(deprecated)] // max_statistics_size
ParquetColumnOptions {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
Expand All @@ -472,6 +478,7 @@ mod tests {
"1.0"
};

#[allow(deprecated)] // max_statistics_size
ParquetOptions {
data_pagesize_limit: 42,
write_batch_size: 42,
Expand Down Expand Up @@ -515,6 +522,7 @@ mod tests {
) -> ParquetColumnOptions {
let bloom_filter_default_props = props.bloom_filter_properties(&col);

#[allow(deprecated)] // max_statistics_size
ParquetColumnOptions {
bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
encoding: props.encoding(&col).map(|s| s.to_string()),
Expand All @@ -535,7 +543,6 @@ mod tests {
),
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
#[allow(deprecated)]
max_statistics_size: Some(props.max_statistics_size(&col)),
}
}
Expand Down Expand Up @@ -569,6 +576,7 @@ mod tests {
HashMap::from([(COL_NAME.into(), configured_col_props)])
};

#[allow(deprecated)] // max_statistics_size
TableParquetOptions {
global: ParquetOptions {
// global options
Expand Down
2 changes: 2 additions & 0 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
fn try_from(
value: &protobuf::ParquetOptions,
) -> datafusion_common::Result<Self, Self::Error> {
#[allow(deprecated)] // max_statistics_size
Ok(ParquetOptions {
enable_page_index: value.enable_page_index,
pruning: value.pruning,
Expand Down Expand Up @@ -1013,6 +1014,7 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
fn try_from(
value: &protobuf::ParquetColumnOptions,
) -> datafusion_common::Result<Self, Self::Error> {
#[allow(deprecated)] // max_statistics_size
Ok(ParquetColumnOptions {
compression: value.compression_opt.clone().map(|opt| match opt {
protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v),
Expand Down
2 changes: 2 additions & 0 deletions datafusion/proto-common/src/to_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
#[allow(deprecated)]
max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
max_row_group_size: value.max_row_group_size as u64,
created_by: value.created_by.clone(),
Expand Down Expand Up @@ -858,6 +859,7 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
.statistics_enabled
.clone()
.map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
#[allow(deprecated)]
max_statistics_size_opt: value.max_statistics_size.map(|v| {
protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
v as u32,
Expand Down
3 changes: 3 additions & 0 deletions datafusion/proto/src/logical_plan/file_formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ impl TableParquetOptionsProto {
};

let column_specific_options = global_options.column_specific_options;
#[allow(deprecated)] // max_statistics_size
TableParquetOptionsProto {
global: Some(ParquetOptionsProto {
enable_page_index: global_options.global.enable_page_index,
Expand Down Expand Up @@ -455,6 +456,7 @@ impl TableParquetOptionsProto {

impl From<&ParquetOptionsProto> for ParquetOptions {
fn from(proto: &ParquetOptionsProto) -> Self {
#[allow(deprecated)] // max_statistics_size
ParquetOptions {
enable_page_index: proto.enable_page_index,
pruning: proto.pruning,
Expand Down Expand Up @@ -509,6 +511,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions {

impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
fn from(proto: ParquetColumnOptionsProto) -> Self {
#[allow(deprecated)] // max_statistics_size
ParquetColumnOptions {
bloom_filter_enabled: proto.bloom_filter_enabled_opt.map(
|parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v,
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b
datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting
datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used
datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
Expand Down
Loading

0 comments on commit 18f14ab

Please sign in to comment.