From b0c7aae923016246d31629a823486cd81b35223e Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Sat, 25 Jan 2025 12:11:28 -0800 Subject: [PATCH] [xla:cpu:cnn] Add ParallelTask structs to improve performance debugging experience Named structs instead of lambdas give a much better debugging experience. PiperOrigin-RevId: 719688160 --- .../runtime/xnnpack/parallel_loop_runner.cc | 76 +++++++++++++------ .../runtime/xnnpack/parallel_loop_runner.h | 6 ++ xla/backends/cpu/xnn_fusion.cc | 24 ++++-- 3 files changed, 78 insertions(+), 28 deletions(-) diff --git a/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.cc b/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.cc index f974be079faa25..429151e422c607 100644 --- a/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.cc +++ b/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.cc @@ -214,6 +214,17 @@ static Task3DTile2DIndex Delinearize(size_t task_index, size_t range_i, // // (2) If done event is not available, we have to overwrite it with a new one // that will be set to concrete state after the task is executed. +// +// We wrap all tasks into structs conforming to the `ParallelTask` API, so that +// in profiles we can see human-readable names of the tasks instead of lambdas. 
+ +struct ParallelLoopRunner::ParallelTask1D { + ABSL_ATTRIBUTE_ALWAYS_INLINE void operator()(size_t task_index) const { + task(task_index); + } + + Task1D task; +}; void ParallelLoopRunner::Parallelize(size_t range, Task1D task) { DCHECK(done_event_) << "Parallel loop runner is in moved-from state"; @@ -232,9 +243,20 @@ void ParallelLoopRunner::Parallelize(size_t range, Task1D task) { return; } - ScheduleAll(range, std::move(task)); + ScheduleAll(range, ParallelTask1D{std::move(task)}); } +struct ParallelLoopRunner::ParallelTask1DTile1D { + ABSL_ATTRIBUTE_ALWAYS_INLINE void operator()(size_t task_index) const { + auto x = Delinearize(task_index, range, tile); + task(x.offset, x.extent); + } + + size_t range; + size_t tile; + Task1DTile1D task; +}; + void ParallelLoopRunner::Parallelize(size_t range, size_t tile, Task1DTile1D task) { DCHECK(done_event_) << "Parallel loop runner is in moved-from state"; @@ -255,15 +277,21 @@ void ParallelLoopRunner::Parallelize(size_t range, size_t tile, return; } - auto parallel_task = [range, tile, - task = std::move(task)](size_t task_index) { - auto x = Delinearize(task_index, range, tile); - task(x.offset, x.extent); - }; - - ScheduleAll(num_tasks, std::move(parallel_task)); + ScheduleAll(num_tasks, ParallelTask1DTile1D{range, tile, std::move(task)}); } +struct ParallelLoopRunner::ParallelTask2DTile1D { + ABSL_ATTRIBUTE_ALWAYS_INLINE void operator()(size_t task_index) const { + auto x = Delinearize(task_index, range_i, range_j, tile_j); + task(x.i, x.offset_j, x.extent_j); + } + + size_t range_i; + size_t range_j; + size_t tile_j; + Task2DTile1D task; +}; + void ParallelLoopRunner::Parallelize(size_t range_i, size_t range_j, size_t tile_j, Task2DTile1D task) { DCHECK(done_event_) << "Parallel loop runner is in moved-from state"; @@ -282,15 +310,24 @@ void ParallelLoopRunner::Parallelize(size_t range_i, size_t range_j, return; } - auto parallel_task = [range_i, range_j, tile_j, - task = std::move(task)](size_t task_index) { - 
auto x = Delinearize(task_index, range_i, range_j, tile_j); - task(x.i, x.offset_j, x.extent_j); - }; - - ScheduleAll(num_tasks, std::move(parallel_task)); + ScheduleAll(num_tasks, + ParallelTask2DTile1D{range_i, range_j, tile_j, std::move(task)}); } +struct ParallelLoopRunner::ParallelTask3DTile2D { + ABSL_ATTRIBUTE_ALWAYS_INLINE void operator()(size_t task_index) const { + auto x = Delinearize(task_index, range_i, range_j, range_k, tile_j, tile_k); + task(x.i, x.offset_j, x.offset_k, x.extent_j, x.extent_k); + } + + size_t range_i; + size_t range_j; + size_t range_k; + size_t tile_j; + size_t tile_k; + Task3DTile2D task; +}; + void ParallelLoopRunner::Parallelize(size_t range_i, size_t range_j, size_t range_k, size_t tile_j, size_t tile_k, Task3DTile2D task) { @@ -312,13 +349,8 @@ void ParallelLoopRunner::Parallelize(size_t range_i, size_t range_j, return; } - auto parallel_task = [range_i, range_j, range_k, tile_j, tile_k, - task = std::move(task)](size_t task_index) { - auto x = Delinearize(task_index, range_i, range_j, range_k, tile_j, tile_k); - task(x.i, x.offset_j, x.offset_k, x.extent_j, x.extent_k); - }; - - ScheduleAll(num_tasks, std::move(parallel_task)); + ScheduleAll(num_tasks, ParallelTask3DTile2D{range_i, range_j, range_k, tile_j, + tile_k, std::move(task)}); } } // namespace xla::cpu diff --git a/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.h b/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.h index b985ba63f976d5..01558e60ee5ee3 100644 --- a/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.h +++ b/xla/backends/cpu/runtime/xnnpack/parallel_loop_runner.h @@ -121,6 +121,12 @@ class ParallelLoopRunner { size_t num_threads() const; private: + // Forward declarations of the parallel tasks. + struct ParallelTask1D; + struct ParallelTask1DTile1D; + struct ParallelTask2DTile1D; + struct ParallelTask3DTile2D; + // Schedules `task` as the AndThen callback of the `done_event_`. Updates // `done_event_` to the new completion event. 
template diff --git a/xla/backends/cpu/xnn_fusion.cc b/xla/backends/cpu/xnn_fusion.cc index b0d12ca0d78a0a..0f0887ba3978f2 100644 --- a/xla/backends/cpu/xnn_fusion.cc +++ b/xla/backends/cpu/xnn_fusion.cc @@ -36,19 +36,31 @@ namespace xla::cpu { static constexpr int64_t kDotThreshold = 10 * 1000; static constexpr int64_t kDefaultThreshold = 100 * 1000; -// We rely on a very simple heuristic to determine if thread pool is beneficial -// for XNNPACK fusions. We assume that if the HLO produces a large result, -// thread pool will be beneficial for running operation in parallel. For small -// operations, thread pool overheads are higher than the actual computation. -static int64_t MaxElementsCount(const HloInstruction* hlo) { +static int64_t MaxElementsCount(const Shape& shape) { int64_t ret = 0; ShapeUtil::ForEachSubshape( - hlo->shape(), [&](const Shape& shape, const ShapeIndex& index) { + shape, [&](const Shape& shape, const ShapeIndex& index) { ret = std::max(ret, ShapeUtil::ElementsIn(shape)); }); return ret; } +// We rely on a very simple heuristic to determine if thread pool is beneficial +// for XNNPACK fusions. We assume that if the HLO produces a large result (or +// has large operands), thread pool will be beneficial for running operation in +// parallel. For small operations, thread pool overheads are higher than the +// actual computation. +static int64_t MaxElementsCount(const HloInstruction* hlo, + bool include_operands = true) { + int64_t ret = MaxElementsCount(hlo->shape()); + if (include_operands) { + for (auto* operand : hlo->operands()) { + ret = std::max(ret, MaxElementsCount(operand->shape())); + } + } + return ret; +} + bool XnnShouldUseThreadPool(const HloInstruction* hlo) { switch (hlo->opcode()) { case HloOpcode::kDot: