Disable Traces on Benchmark when in ServiceLab (pytorch#1804)
Summary:
Pull Request resolved: pytorch#1804

Training benchmarks have been broken on trunk due to multiprocessing issues. This diff appears to fix them for good. We will wait for the periodic experiment to complete successfully before modifying TARGETS so the benchmark runs on all diffs.

Differential Revision: D55036955
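
For orientation, the sketch below (illustrative only, not the torchrec code) captures the guarded-profiling pattern the diff introduces: torch.profiler tracing and trace export run only when an output directory is provided, so a caller that passes an empty output_dir gets no traces. Function and parameter names here are placeholders.

import torch
from torch.profiler import ProfilerActivity, profile, record_function

def profile_if_enabled(model, prof_inputs, output_dir: str, name: str = "demo") -> None:
    # The guard this diff adds: skip tracing entirely when no output directory is set.
    if output_dir == "":
        return

    def trace_handler(prof) -> None:
        # Export a Chrome trace and CPU stacks once profiling finishes.
        prof.export_chrome_trace(f"{output_dir}/trace-{name}.json")
        prof.export_stacks(f"{output_dir}/stacks-cpu-{name}.stacks", "self_cpu_time_total")

    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        with_stack=True,  # required for export_stacks
        on_trace_ready=trace_handler,
    ) as p:
        for batch in prof_inputs:
            with record_function("## forward ##"):
                model(batch)
            p.step()

The real handler in benchmark_utils.py also logs profiler totals and exports CUDA stacks, as shown in the diff below.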
PaulZhang12 authored and facebook-github-bot committed Mar 19, 2024
1 parent 2af5313 commit bc4f0d8
Showing 1 changed file with 50 additions and 47 deletions.
torchrec/distributed/benchmark/benchmark_utils.py (+50 −47)
@@ -506,53 +506,54 @@ def benchmark(
         b = torch.cuda.max_memory_allocated(rank)
         max_mem_allocated.append(b // 1024 // 1024)
 
-    # pyre-ignore[2]
-    def trace_handler(prof) -> None:
-        total_average = prof.profiler.total_average()
-        logger.info(f" TOTAL_AVERAGE:\n{name}\n{total_average}")
-        dir_path: str = output_dir
-
-        # Don't output trace files if dir_path is empty
-        # or rank != 0, rank=-1 in no pg case, only 1 rank should output
-        # in pg case, so rank=0
-        if dir_path == "" or rank > 0:
-            return
-
-        trace_file: str = f"{dir_path}/trace-{name}.json"
-        stacks_cpu_file = f"{dir_path}/stacks-cpu-{name}.stacks"
-        stacks_cuda_file = f"{dir_path}/stacks-cuda-{name}.stacks"
-        logger.info(f" PROFILE[{name}].chrome_trace:{trace_file}")
-
-        prof.export_chrome_trace(trace_file)
-        prof.export_stacks(stacks_cpu_file, "self_cpu_time_total")
-        prof.export_stacks(stacks_cuda_file, "self_cuda_time_total")
-
-    # - git clone https://github.com/brendangregg/FlameGraph
-    # - cd FlameGraph
-    # - ./flamegraph.pl --title "CPU time" --countname "us." profiler.stacks > perf_viz.svg
-
-    with torch.profiler.profile(
-        activities=[
-            torch.profiler.ProfilerActivity.CPU,
-            torch.profiler.ProfilerActivity.CUDA,
-        ],
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-        with_modules=True,
-        on_trace_ready=trace_handler,
-    ) as p:
-        for _input in prof_inputs:
-            with record_function("## forward ##"):
-                model(_input)
-            p.step()
-
-    if rank == -1:
-        for di in range(world_size):
-            torch.cuda.synchronize(di)
-    else:
-        torch.cuda.synchronize(rank)
+    if output_dir != "":
+        # Only do profiling if output_dir is set
+
+        # pyre-ignore[2]
+        def trace_handler(prof) -> None:
+            total_average = prof.profiler.total_average()
+            logger.info(f" TOTAL_AVERAGE:\n{name}\n{total_average}")
+            dir_path: str = output_dir
+
+            # only 1 rank should output in pg case, rank = 0
+            if rank > 0:
+                return
+
+            trace_file: str = f"{dir_path}/trace-{name}.json"
+            stacks_cpu_file = f"{dir_path}/stacks-cpu-{name}.stacks"
+            stacks_cuda_file = f"{dir_path}/stacks-cuda-{name}.stacks"
+            logger.info(f" PROFILE[{name}].chrome_trace:{trace_file}")
+
+            prof.export_chrome_trace(trace_file)
+            prof.export_stacks(stacks_cpu_file, "self_cpu_time_total")
+            prof.export_stacks(stacks_cuda_file, "self_cuda_time_total")
+
+        # - git clone https://github.com/brendangregg/FlameGraph
+        # - cd FlameGraph
+        # - ./flamegraph.pl --title "CPU time" --countname "us." profiler.stacks > perf_viz.svg
+
+        with torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+            with_flops=True,
+            with_modules=True,
+            on_trace_ready=trace_handler,
+        ) as p:
+            for _input in prof_inputs:
+                with record_function("## forward ##"):
+                    model(_input)
+                p.step()
+
+        if rank == -1:
+            for di in range(torch.cuda.device_count()):
+                torch.cuda.synchronize(torch.device(f"cuda:{di}"))
+        else:
+            torch.cuda.synchronize()
 
     return BenchmarkResult(
         short_name=name,
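
A side note on the FlameGraph recipe kept in the comments above: the sketch below shows one way to drive it from Python once the .stacks files have been exported. It assumes a local clone of https://github.com/brendangregg/FlameGraph and Perl on PATH; the helper name is hypothetical.

import subprocess
from pathlib import Path

def stacks_to_flamegraph(stacks_file: str, out_svg: str, flamegraph_dir: str = "FlameGraph") -> None:
    # Mirrors the recipe in the comments:
    #   ./flamegraph.pl --title "CPU time" --countname "us." profiler.stacks > perf_viz.svg
    flamegraph_pl = Path(flamegraph_dir) / "flamegraph.pl"
    with open(out_svg, "w") as svg:
        subprocess.run(
            [str(flamegraph_pl), "--title", "CPU time", "--countname", "us.", stacks_file],
            stdout=svg,
            check=True,
        )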
@@ -752,6 +753,8 @@ def benchmark_module(
         world_size: World size used in the
         num_benchmarks: How many times to run over benchmark inputs for statistics
         output_dir: Directory to output profiler outputs (traces, stacks)
+        func_to_benchmark: Custom function to benchmark, check out default_func_to_benchmark for default
+        benchmark_func_kwargs: Custom keyword arguments to pass to func_to_benchmark
 
     Returns:
         A list of BenchmarkResults
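The second hunk documents the two new benchmark_module parameters. As a hedged illustration (names and signature below are assumptions, not torchrec's actual API), a func_to_benchmark / benchmark_func_kwargs pair could look like this, with the harness expected to invoke func_to_benchmark(model, bench_inputs, **benchmark_func_kwargs):

from typing import Any, Dict, List

import torch

# Hypothetical stand-in for func_to_benchmark; torchrec's actual
# default_func_to_benchmark may use a different signature.
def forward_only(model: torch.nn.Module, bench_inputs: List[Any], num_repeats: int = 1) -> None:
    for _ in range(num_repeats):
        for batch in bench_inputs:
            model(batch)

# Extra keyword arguments the harness forwards to the callable.
benchmark_func_kwargs: Dict[str, Any] = {"num_repeats": 3}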
