Increase batch size x4
Dandandan committed Dec 1, 2023
1 parent 845c5d2 commit c2670bf
Showing 15 changed files with 21 additions and 21 deletions.
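
For context (not part of the diff itself): the value changed throughout this commit is only the compiled-in default; a session can still override it explicitly. A minimal sketch of such an override, assuming DataFusion's `SessionConfig::with_batch_size` builder and the `SessionContext::new_with_config` constructor:

```rust
use datafusion::prelude::*;

fn main() {
    // Rely on the compiled-in default, which this commit raises from 8192 to 32768.
    let default_config = SessionConfig::new();
    println!("default batch_size = {}", default_config.batch_size());

    // Or set an explicit per-session value, which takes precedence over the default.
    let custom_config = SessionConfig::new().with_batch_size(65536);
    let _ctx = SessionContext::new_with_config(custom_config);
}
```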
2 changes: 1 addition & 1 deletion benchmarks/README.md
@@ -293,7 +293,7 @@ Example output:

```
Running benchmarks with the following options: Opt { debug: false, iterations: 3, partitions: 2, path: "./data",
-batch_size: 8192, scale_factor: 1.0 }
+batch_size: 32768, scale_factor: 1.0 }
Generated test dataset with 10699521 rows
Executing with filter 'request_method = Utf8("GET")'
Using scan options ParquetScanOptions { pushdown_filters: false, reorder_predicates: false, enable_page_index: false }
2 changes: 1 addition & 1 deletion benchmarks/src/parquet_filter.rs
@@ -47,7 +47,7 @@ use structopt::StructOpt;
///
/// Example output:
///
-/// Running benchmarks with the following options: Opt { debug: false, iterations: 3, partitions: 2, path: "./data", batch_size: 8192, scale_factor: 1.0 }
+/// Running benchmarks with the following options: Opt { debug: false, iterations: 3, partitions: 2, path: "./data", batch_size: 32768, scale_factor: 1.0 }
/// Generated test dataset with 10699521 rows
/// Executing with filter 'request_method = Utf8("GET")'
/// Using scan options ParquetScanOptions { pushdown_filters: false, reorder_predicates: false, enable_page_index: false }
4 changes: 2 additions & 2 deletions benchmarks/src/tpch/run.rs
@@ -326,7 +326,7 @@ mod tests {
let common = CommonOpt {
iterations: 1,
partitions: Some(2),
-batch_size: 8192,
+batch_size: 32768,
debug: false,
};
let opt = RunOpt {
@@ -358,7 +358,7 @@
let common = CommonOpt {
iterations: 1,
partitions: Some(2),
-batch_size: 8192,
+batch_size: 32768,
debug: false,
};
let opt = RunOpt {
2 changes: 1 addition & 1 deletion datafusion-examples/examples/csv_opener.rs
@@ -42,7 +42,7 @@ async fn main() -> Result<()> {
let schema = aggr_test_schema();

let config = CsvConfig::new(
-8192,
+32768,
schema.clone(),
Some(vec![12, 0]),
true,
2 changes: 1 addition & 1 deletion datafusion-examples/examples/json_opener.rs
@@ -55,7 +55,7 @@ async fn main() -> Result<()> {
let projected = Arc::new(schema.clone().project(&[1, 0])?);

let opener = JsonOpener::new(
-8192,
+32768,
projected,
FileCompressionType::UNCOMPRESSED,
Arc::new(object_store),
2 changes: 1 addition & 1 deletion datafusion/core/benches/distinct_query_sql.rs
@@ -53,7 +53,7 @@ fn create_context(
fn criterion_benchmark_limited_distinct(c: &mut Criterion) {
let partitions_len = 10;
let array_len = 1 << 26; // 64 M
-let batch_size = 8192;
+let batch_size = 32768;
let ctx = create_context(partitions_len, array_len, batch_size).unwrap();

let mut group = c.benchmark_group("custom-measurement-time");
2 changes: 1 addition & 1 deletion datafusion/core/tests/config_from_env.rs
@@ -41,5 +41,5 @@ fn from_env() {

env::remove_var(env_key);
let config = ConfigOptions::from_env().unwrap();
-assert_eq!(config.execution.batch_size, 8192); // set to its default value
+assert_eq!(config.execution.batch_size, 32768); // set to its default value
}
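
The test above reads the setting from the environment; a minimal sketch of that path, assuming the variable name `DATAFUSION_EXECUTION_BATCH_SIZE` (DataFusion derives env keys from config names, but the exact key is not shown in this hunk):

```rust
use std::env;

use datafusion::config::ConfigOptions;

fn main() -> datafusion::error::Result<()> {
    // Hypothetical env key derived from `datafusion.execution.batch_size`.
    let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";

    // With the variable set, `from_env` picks up the override.
    env::set_var(env_key, "4096");
    let config = ConfigOptions::from_env()?;
    assert_eq!(config.execution.batch_size, 4096);

    // Without it, the compiled-in default (now 32768) applies, as the test asserts.
    env::remove_var(env_key);
    let config = ConfigOptions::from_env()?;
    assert_eq!(config.execution.batch_size, 32768);
    Ok(())
}
```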
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/aggregate.slt
@@ -3066,7 +3066,7 @@ from aggregate_test_100;
0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695

statement ok
-set datafusion.execution.batch_size = 8192;
+set datafusion.execution.batch_size = 32768;



2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/explain.slt
@@ -168,7 +168,7 @@ Dml: op=[Insert Into] table=[sink_table]
----Sort: aggregate_test_100.c1 ASC NULLS LAST
------TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13]
physical_plan
-FileSinkExec: sink=StreamWrite { location: "../../testing/data/csv/aggregate_test_100.csv", batch_size: 8192, encoding: Csv, header: true, .. }
+FileSinkExec: sink=StreamWrite { location: "../../testing/data/csv/aggregate_test_100.csv", batch_size: 32768, encoding: Csv, header: true, .. }
--SortExec: expr=[c1@0 ASC NULLS LAST]
----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], has_header=true

8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/information_schema.slt
@@ -147,7 +147,7 @@ datafusion.catalog.has_header false
datafusion.catalog.information_schema true
datafusion.catalog.location NULL
datafusion.execution.aggregate.scalar_update_factor 10
-datafusion.execution.batch_size 8192
+datafusion.execution.batch_size 32768
datafusion.execution.coalesce_batches true
datafusion.execution.collect_statistics false
datafusion.execution.max_buffered_batches_per_output_file 2
@@ -220,7 +220,7 @@ datafusion.catalog.has_header false If the file has a header
datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information
datafusion.catalog.location NULL Location scanned to load tables for `default` schema
datafusion.execution.aggregate.scalar_update_factor 10 Specifies the threshold for using `ScalarValue`s to update accumulators during high-cardinality aggregations for each input batch. The aggregation is considered high-cardinality if the number of affected groups is greater than or equal to `batch_size / scalar_update_factor`. In such cases, `ScalarValue`s are utilized for updating accumulators, rather than the default batch-slice approach. This can lead to performance improvements. By adjusting the `scalar_update_factor`, you can balance the trade-off between more efficient accumulator updates and the number of groups affected.
-datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
+datafusion.execution.batch_size 32768 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
datafusion.execution.collect_statistics false Should DataFusion collect statistics after listing files
datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption
@@ -285,13 +285,13 @@ datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser
query TT
SHOW datafusion.execution.batch_size
----
-datafusion.execution.batch_size 8192
+datafusion.execution.batch_size 32768

# show_variable_in_config_options_verbose
query TTT
SHOW datafusion.execution.batch_size VERBOSE
----
-datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
+datafusion.execution.batch_size 32768 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption

# show_time_zone_default_utc
# https://github.com/apache/arrow-datafusion/issues/3255
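One knock-on effect of this change, per the `scalar_update_factor` description above: the high-cardinality threshold is derived from `batch_size / scalar_update_factor`, so raising the default batch size also raises that threshold. A small sketch of the arithmetic, assuming integer division and the default factor of 10:

```rust
fn main() {
    // Per the option description: an aggregation batch is treated as high-cardinality
    // when the number of affected groups is >= batch_size / scalar_update_factor.
    let scalar_update_factor: usize = 10;
    for batch_size in [8192_usize, 32768] {
        println!(
            "batch_size {:>5} -> high-cardinality threshold: {} groups",
            batch_size,
            batch_size / scalar_update_factor
        );
    }
}
```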
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/join.slt
@@ -593,4 +593,4 @@ drop table IF EXISTS full_join_test;

# batch size
statement ok
-set datafusion.execution.batch_size = 8192;
+set datafusion.execution.batch_size = 32768;
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/options.slt
@@ -78,7 +78,7 @@ CoalesceBatchesExec: target_batch_size=1234


statement ok
-set datafusion.execution.batch_size = 8192;
+set datafusion.execution.batch_size = 32768;

statement ok
drop table a
6 changes: 3 additions & 3 deletions docs/source/user-guide/cli.md
@@ -405,7 +405,7 @@ Available commands inside DataFusion CLI are:
+-------------------------------------------------+---------+
| name | value |
+-------------------------------------------------+---------+
-| datafusion.execution.batch_size | 8192 |
+| datafusion.execution.batch_size | 32768 |
| datafusion.execution.coalesce_batches | true |
| datafusion.execution.time_zone | UTC |
| datafusion.explain.logical_plan_only | false |
@@ -426,7 +426,7 @@ Available commands inside DataFusion CLI are:
+-------------------------------------------------+---------+
| name | value |
+-------------------------------------------------+---------+
-| datafusion.execution.batch_size | 8192 |
+| datafusion.execution.batch_size | 32768 |
+-------------------------------------------------+---------+

```
@@ -477,7 +477,7 @@ DataFusion CLI v13.0.0
+---------------------------------+---------+
| name | value |
+---------------------------------+---------+
-| datafusion.execution.batch_size | 8192 |
+| datafusion.execution.batch_size | 32768 |
+---------------------------------+---------+
1 row in set. Query took 0.011 seconds.
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
@@ -44,7 +44,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema |
| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema |
| datafusion.catalog.has_header | false | If the file has a header |
-| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption |
+| datafusion.execution.batch_size | 32768 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption |
| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |
| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files |
| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system |
2 changes: 1 addition & 1 deletion docs/source/user-guide/sql/information_schema.md
@@ -60,7 +60,7 @@ To show the current session configuration options, use the `SHOW ALL` command or
+-------------------------------------------------+---------+
| name | setting |
+-------------------------------------------------+---------+
-| datafusion.execution.batch_size | 8192 |
+| datafusion.execution.batch_size | 32768 |
| datafusion.execution.coalesce_batches | true |
| datafusion.execution.time_zone | UTC |
| datafusion.explain.logical_plan_only | false |
