Commit 6bf6dd8
[XLA:GPU] Rename `xla_gpu_enable_experimental_pipeline_parallelism_opt` to `xla_gpu_experimental_enable_pipeline_parallelism_opt`.

This is to follow the agreed-upon flag nomenclature.

PiperOrigin-RevId: 717879485
bchetioui authored and Google-ML-Automation committed Jan 23, 2025
1 parent 13125fb commit 6bf6dd8
Showing 6 changed files with 14 additions and 15 deletions.
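
For downstream users, the rename is purely mechanical: the proto field number (351) and the default value (false) are unchanged, but any script passing the old name via XLA_FLAGS, and any code calling the old accessors, must switch to the new spelling. A minimal sketch of setting the renamed option programmatically, mirroring the test setup in this commit (include paths assumed from the XLA source tree):

#include "xla/service/hlo_module_config.h"
#include "xla/xla.pb.h"

// Enable the renamed experimental option on a module configuration.
xla::HloModuleConfig MakeConfigWithPipelineParallelismOpt() {
  xla::DebugOptions debug_options;
  debug_options.set_xla_gpu_experimental_enable_pipeline_parallelism_opt(true);

  xla::HloModuleConfig config;
  config.set_debug_options(debug_options);
  return config;
}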
xla/debug_options_flags.cc (8 changes: 4 additions & 4 deletions)

@@ -205,7 +205,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   opts.set_xla_gpu_collective_permute_decomposer_threshold(
       std::numeric_limits<int64_t>::max());
-  opts.set_xla_gpu_enable_experimental_pipeline_parallelism_opt(false);
+  opts.set_xla_gpu_experimental_enable_pipeline_parallelism_opt(false);
 
   opts.set_xla_cpu_enable_mlir_tiling_and_fusion(true);
   opts.set_xla_cpu_enable_custom_matmul_tiling(false);

@@ -1713,11 +1713,11 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       debug_options->xla_gpu_collective_permute_decomposer_threshold(),
       "Collective permute decomposer threshold."));
   flag_list->push_back(tsl::Flag(
-      "xla_gpu_enable_experimental_pipeline_parallelism_opt",
+      "xla_gpu_experimental_enable_pipeline_parallelism_opt",
       bool_setter_for(
           &DebugOptions::
-              set_xla_gpu_enable_experimental_pipeline_parallelism_opt),
-      debug_options->xla_gpu_enable_experimental_pipeline_parallelism_opt(),
+              set_xla_gpu_experimental_enable_pipeline_parallelism_opt),
+      debug_options->xla_gpu_experimental_enable_pipeline_parallelism_opt(),
       "Experimental optimizations for SPMD-based pipeline parallelism on "
       "GPU."));
   flag_list->push_back(tsl::Flag(
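
The registration above passes bool_setter_for a pointer to the renamed proto setter. As a rough illustration of the pattern (a sketch, not the actual helper in debug_options_flags.cc), such a binder can be written as:

#include <functional>

#include "xla/xla.pb.h"

// Sketch: bind a DebugOptions member setter so the flag parser can write
// parsed boolean values straight into the proto instance.
std::function<bool(bool)> BoolSetterFor(
    xla::DebugOptions* debug_options,
    void (xla::DebugOptions::*setter)(bool)) {
  return [debug_options, setter](bool value) {
    (debug_options->*setter)(value);
    return true;  // Report to the parser that the value was consumed.
  };
}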
xla/service/gpu/gpu_compiler.cc (6 changes: 3 additions & 3 deletions)

@@ -954,7 +954,7 @@ absl::Status RunCollectiveOptimizationPasses(
 
   if (hlo_module->config()
           .debug_options()
-          .xla_gpu_enable_experimental_pipeline_parallelism_opt()) {
+          .xla_gpu_experimental_enable_pipeline_parallelism_opt()) {
     collectives_pipeline.AddPass<CollectiveSelectFolder>();
   }
 
@@ -971,7 +971,7 @@ absl::Status RunCollectiveOptimizationPasses(
         collectives_pipeline,
         hlo_module->config()
             .debug_options()
-            .xla_gpu_enable_experimental_pipeline_parallelism_opt());
+            .xla_gpu_experimental_enable_pipeline_parallelism_opt());
   }
 
   // Run algebraic simplifier to reshape(broadcast) into a broadcast when

@@ -2669,7 +2669,7 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
 
   if (!module->config()
            .debug_options()
-           .xla_gpu_enable_experimental_pipeline_parallelism_opt() &&
+           .xla_gpu_experimental_enable_pipeline_parallelism_opt() &&
       (module->config()
            .debug_options()
            .xla_gpu_enable_pipelined_collectives() ||
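
The same accessor chain now appears spelled out at three call sites in this file. If more call sites accumulate, a small helper could centralize the spelling; a hypothetical sketch (the helper is invented for illustration, the include path assumed from the XLA source tree):

#include "xla/hlo/ir/hlo_module.h"

// Hypothetical helper (not part of this commit): centralize the accessor
// chain used at the three call sites above.
inline bool PipelineParallelismOptEnabled(const xla::HloModule& module) {
  return module.config()
      .debug_options()
      .xla_gpu_experimental_enable_pipeline_parallelism_opt();
}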
xla/service/gpu/gpu_latency_hiding_scheduler.cc (4 changes: 2 additions & 2 deletions)

@@ -272,13 +272,13 @@ void GpuAsyncTrackerBase::PostProcessScheduleGraph(
   // Schedule partially pipelined send/recv instructions late so that they can
   // overlap with compute. Schedule send/recv late and, when unblocked,
   // schedule send-done/recv-done early.
-  if (debug_options.xla_gpu_enable_experimental_pipeline_parallelism_opt() &&
+  if (debug_options.xla_gpu_experimental_enable_pipeline_parallelism_opt() &&
       IsPartiallyPipelinedSendRecv(inst)) {
     HloGraphNode& node = schedule_graph->GetNode(inst);
     node.SetForceDelay(true);
     VLOG(5) << "Setting force delay for instruction: " << inst->ToString();
   }
-  if (debug_options.xla_gpu_enable_experimental_pipeline_parallelism_opt() &&
+  if (debug_options.xla_gpu_experimental_enable_pipeline_parallelism_opt() &&
       IsPartiallyPipelinedSendRecvDone(inst)) {
     HloGraphNode& node = schedule_graph->GetNode(inst);
     node.SetForceEarly(true);
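
For readers skimming the hunk above: the two guarded blocks are complementary halves of the overlap strategy the comment describes. A condensed restatement (illustration only, not part of the commit; it assumes an instruction matches at most one of the two predicates):

// Condensed form of the two blocks above: starts are pushed late,
// completions pulled early, widening the window in which compute can
// overlap the in-flight communication.
if (debug_options.xla_gpu_experimental_enable_pipeline_parallelism_opt()) {
  if (IsPartiallyPipelinedSendRecv(inst)) {
    // Issue the send/recv as late as possible.
    schedule_graph->GetNode(inst).SetForceDelay(true);
  } else if (IsPartiallyPipelinedSendRecvDone(inst)) {
    // Retire the matching *-done as early as possible.
    schedule_graph->GetNode(inst).SetForceEarly(true);
  }
}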
xla/service/gpu/gpu_latency_hiding_scheduler_test.cc (2 changes: 1 addition & 1 deletion)

@@ -84,7 +84,7 @@ class GpuLatencyHidingSchedulerBaseTest : public HloTestBase {
     HloModuleConfig config;
     DebugOptions debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_enable_latency_hiding_scheduler(true);
-    debug_options.set_xla_gpu_enable_experimental_pipeline_parallelism_opt(
+    debug_options.set_xla_gpu_experimental_enable_pipeline_parallelism_opt(
        enable_experimental_pipeline_parallelism_opt);
     config.set_debug_options(debug_options);
     config.set_fdo_profile(fdo_profile);
xla/tests/collective_pipeline_parallelism_test.cc (2 changes: 1 addition & 1 deletion)

@@ -56,7 +56,7 @@ class CollectivePipelineParallelismTest
 
     // Set debug options.
     DebugOptions debug_options = GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_enable_experimental_pipeline_parallelism_opt(
+    debug_options.set_xla_gpu_experimental_enable_pipeline_parallelism_opt(
        GetParam());
     config.set_debug_options(debug_options);
 
xla/xla.proto (7 changes: 3 additions & 4 deletions)

@@ -386,10 +386,6 @@ message DebugOptions {
   // dynamic-update-slice operations around library calls.
   bool xla_gpu_enable_dynamic_slice_fusion = 105;
 
-  // Experimental optimizations for SPMD-based pipeline parallelism on GPU.
-  // TODO(bchetioui): adjust this name to follow the naming convention.
-  bool xla_gpu_enable_experimental_pipeline_parallelism_opt = 351;
-
   // When true we lower the Minimum and Maximum hlos in the GPU backend such
   // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag
   // this is true we don't propagate NaNs through Min and Max.

@@ -513,6 +509,9 @@ message DebugOptions {
   // Pre-existing block-level fusions are left unmodified.
   bool xla_gpu_experimental_enable_fusion_block_level_rewriter = 334;
 
+  // Experimental optimizations for SPMD-based pipeline parallelism on GPU.
+  bool xla_gpu_experimental_enable_pipeline_parallelism_opt = 351;
+
   // When enabled, the PriorityFusion pass will try to make Triton fusions first
   // and foremost where it is possible.
   //
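
Relocating the field places it alongside the other xla_gpu_experimental_* fields. Because the field number 351 and the type are unchanged, binary-serialized DebugOptions remain wire-compatible; only the generated accessor names and the text-format spelling change. A sketch of the accessors generated after this commit (the wrapper function is illustrative):

#include "xla/xla.pb.h"

// Generated C++ accessors after the rename; every call site in this
// commit switches to these spellings.
void Example() {
  xla::DebugOptions opts;
  opts.set_xla_gpu_experimental_enable_pipeline_parallelism_opt(true);
  const bool enabled =
      opts.xla_gpu_experimental_enable_pipeline_parallelism_opt();
  (void)enabled;
}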
