From 40e956f7daa6271a93809d5a73812b43e4356098 Mon Sep 17 00:00:00 2001 From: skrtskrtfb <72409736+skrtskrtfb@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:21:25 -0800 Subject: [PATCH] Add memory planning tests to Cadence Differential Revision: D67183235 Pull Request resolved: https://github.com/pytorch/executorch/pull/7431 --- backends/cadence/aot/TARGETS | 19 + .../cadence/aot/tests/test_memory_passes.py | 604 ++++++++++++++++++ 2 files changed, 623 insertions(+) create mode 100644 backends/cadence/aot/tests/test_memory_passes.py diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 906b20bca0..b1484855d6 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -409,3 +409,22 @@ python_library( "//executorch/exir:tensor", ], ) + + +python_unittest( + name = "test_memory_passes", + srcs = [ + "tests/test_memory_passes.py", + ], + typing = True, + deps = [ + ":compiler", + ":memory_planning", + ":ops_registrations", + ":pass_utils", + "//caffe2:torch", + "//executorch/exir:memory", + "//executorch/exir/dialects:lib", + "//executorch/exir/tests:models", + ], +) diff --git a/backends/cadence/aot/tests/test_memory_passes.py b/backends/cadence/aot/tests/test_memory_passes.py new file mode 100644 index 0000000000..d1971ea605 --- /dev/null +++ b/backends/cadence/aot/tests/test_memory_passes.py @@ -0,0 +1,604 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
"""Tests for Cadence AOT memory planning and memory-related op optimizations."""

import math
import unittest
from typing import Any, Tuple

import executorch.backends.cadence.aot.ops_registrations  # noqa
import torch
from executorch.backends.cadence.aot import compiler
from executorch.backends.cadence.aot.memory_planning import find_peak_memory_usage
from executorch.backends.cadence.aot.pass_utils import count_node
from executorch.exir import memory
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.tests.models import MultiLayerPerceptron


def _export_graph_module(
    model: torch.nn.Module,
    inputs: Tuple[torch.Tensor, ...],
    *,
    eliminate_dead_code: bool = False,
    **compile_kwargs: Any,
) -> torch.fx.GraphModule:
    """Compile ``model`` with the Cadence compiler and return its graph module.

    Args:
        model: Module to export.
        inputs: Example inputs used for the export.
        eliminate_dead_code: When True, run ``graph.eliminate_dead_code()`` on
            the resulting graph before returning it.
        **compile_kwargs: Forwarded unchanged to
            ``compiler.export_to_executorch_gen_etrecord`` (e.g. ``opt_level``,
            ``mem_algo``, ``alloc_graph_input``, ``alloc_graph_output``).

    Returns:
        The graph module of the exported program.
    """
    graph_module = (
        compiler.export_to_executorch_gen_etrecord(model, inputs, **compile_kwargs)
        .exported_program()
        .graph_module
    )
    if eliminate_dead_code:
        graph_module.graph.eliminate_dead_code()
    return graph_module


class TestMemPlanningPasses(unittest.TestCase):
    """Checks the peak memory usage reported by ``find_peak_memory_usage``."""

    def test_calculate_peak_memory_pass(self) -> None:
        """Peak usage of two small models matches the hand-computed value."""

        class PeakMemoryTestModel(torch.nn.Module):
            def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
                super().__init__()
                self.linear = torch.nn.Linear(input_dim, hidden_dim)
                self.relu = torch.nn.ReLU()
                self.linear2 = torch.nn.Linear(hidden_dim, output_dim)

            def forward(self, x: torch.Tensor):
                x = self.linear(x)
                x = self.relu(x)
                x = self.linear2(x)
                return x

        def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int:
            # Round `num` up to the next multiple of `alignment`.
            return math.ceil(num / alignment) * alignment

        # model 1
        batch_size, input_dim, hidden_dim, output_dim = 3, 16, 10, 20

        inputs = (torch.ones(batch_size, input_dim),)
        model = PeakMemoryTestModel(input_dim, hidden_dim, output_dim)
        graph_module = _export_graph_module(model, inputs)

        peak_usage, _ = find_peak_memory_usage(
            graph_module,
            mem_constraints=None,
            alloc_graph_input=True,
            alloc_graph_output=True,
        )
        # Align data on a 16 byte boundary; the factor 4 is the element size
        # in bytes of the default float32 tensors built by torch.ones.
        expected_peak_usage = calculate_aligned_num_bytes(
            hidden_dim * batch_size * 4
        ) + calculate_aligned_num_bytes(output_dim * batch_size * 4)
        self.assertEqual(peak_usage, expected_peak_usage)

        # model 2
        batch_size, input_dim, hidden_dim, output_dim = 11, 10, 16, 8

        inputs = (torch.ones(batch_size, input_dim),)
        model = MultiLayerPerceptron(
            input_dim, hidden_dim, hidden_dim, hidden_dim, output_dim
        )
        graph_module = _export_graph_module(model, inputs)

        peak_usage, _ = find_peak_memory_usage(
            graph_module,
            mem_constraints=None,
            alloc_graph_input=True,
            alloc_graph_output=True,
        )
        # Align data on a 16 byte boundary
        expected_peak_usage = 2 * calculate_aligned_num_bytes(
            hidden_dim * batch_size * 4
        )
        self.assertEqual(peak_usage, expected_peak_usage)

    def test_zero_memory_pass(self) -> None:
        """A model that only slices its input reports a peak usage of 0."""

        class ZeroMem(torch.nn.Module):
            def forward(self, x):
                return x[:, 2::3, ...]

        x = torch.randn(2, 7, 3, 2)

        # Compiler with alloc_graph_input=False and alloc_graph_output=False.
        # Cadence won't allocate memory for input and output, and the total memory
        # usage will be 0
        graph_module = _export_graph_module(
            ZeroMem(),
            (x,),
            eliminate_dead_code=True,
            alloc_graph_input=False,
            alloc_graph_output=False,
        )
        peak_usage, _ = find_peak_memory_usage(
            graph_module,
            alloc_graph_input=False,
            alloc_graph_output=False,
            mem_constraints=None,
        )
        self.assertEqual(peak_usage, 0)


class TestMemTransform(unittest.TestCase):
    """Checks which cat/slice/select/view ops are turned into their nop variants.

    Per the original test notes, these optimizations only happen at
    opt_level 2+ and require the memory planning pass to run.
    """

    def test_optimize_cat(self) -> None:
        """Two feasible and two infeasible cat layouts."""

        class OptimizeCatFeasible1(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                # Cat along the outermost dimension can be optimized away after
                # adding constraints on the locations of x1 and y1.
                return torch.ops.aten.cat((x1, y1))

        x = torch.ones(3, 6)
        y = torch.ones(2, 6)
        graph_module = _export_graph_module(
            OptimizeCatFeasible1(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)
        # Assert that cat op is replaced by its nop version post optimization
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)

        class OptimizeCatFeasible2(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                # Cat along dim 1 is expected to be optimized away as well: the
                # only dimension before it has size 1.
                return torch.ops.aten.cat((x1, y1), 1)

        x = torch.ones(1, 3, 6)
        y = torch.ones(1, 2, 6)
        graph_module = _export_graph_module(
            OptimizeCatFeasible2(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)
        # Assert that cat op is replaced by its nop version post optimization
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)

        class OptimizeCatInfeasible1(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                # Cat along dim 1 while dim 0 has size 2: this is not a cat
                # along the outermost dimension, so it is expected to stay.
                return torch.ops.aten.cat((x1, y1), 1)

        x = torch.ones(2, 4, 5)
        y = torch.ones(2, 2, 5)
        graph_module = _export_graph_module(
            OptimizeCatInfeasible1(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is not optimized away, since the concat is not
        # along the outermost dim
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1)

        class OptimizeCatInfeasible2(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                # The cat result feeds another op (the add), so the cat is not
                # the output of the graph.
                return torch.ops.aten.cat((x1, y1), 0) + 2

        x = torch.ones(5, 5)
        y = torch.ones(3, 5)
        graph_module = _export_graph_module(
            OptimizeCatInfeasible2(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is not optimized away, since the concat relative
        # offsets are not multiple of 8 bytes, and the cat is not the output
        # of the graph.
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1)

    def test_optimize_cat_with_slice(self) -> None:
        """Cat of a computed tensor and a slice of a graph input becomes a nop."""

        class OptimizeCatSliceFeasible(torch.nn.Module):
            def forward(self, x):
                x1 = torch.add(x, 2.4, 3.1)
                x2 = torch.ops.aten.slice(x, 0, 0, 1)
                x3 = torch.ops.aten.cat((x1, x2))
                return torch.add(x3, x3)

        x = torch.randn(5, 6)
        # Compile, and set alloc_graph_input to False so that slice op is not
        # optimized away.
        graph_module = _export_graph_module(
            OptimizeCatSliceFeasible(),
            (x,),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
            alloc_graph_input=False,
        )
        # Assert that cat op is optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)

    def test_optimize_cat_with_slice_infeasible(self) -> None:
        """Cat whose input is a slice of a computed tensor is kept."""

        class OptimizeCatSliceInfeasible(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                y2 = torch.ops.aten.slice(y1, 0, 0, 1)
                # Cat can't be optimized away if any of its input tensors
                # (here y2) is a slice_nop
                return torch.ops.aten.cat((y2, x1))

        x = torch.ones(3, 5)
        y = torch.ones(2, 5)
        graph_module = _export_graph_module(
            OptimizeCatSliceInfeasible(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is not optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1)

    def test_optimize_slice_Tensor(self) -> None:
        """Slice ops become nops, except a slice of an unallocated graph input."""

        class SliceTensor(torch.nn.Module):
            def forward(self, x, y, z):
                x1 = torch.add(x, 2.4, 3.1)
                # This slice should always be optimized, since x1 is not placeholder
                # and the slice is along the outermost dim
                t1 = torch.ops.aten.slice(x1, 0, 1, 2)
                # This slice should not be optimized when alloc_graph_input=False,
                # since y is a placeholder node
                t2 = torch.ops.aten.slice(y, 0, 0, 1)
                # This slice should be always optimized, since the dims before
                # sliced dims are 1
                z1 = torch.add(z, 2.4, 3.1)
                t3 = torch.ops.aten.slice(z1, 1, 4, 5)
                return (t1 + t2) * t3

        x = torch.ones(3, 6)
        y = torch.ones(2, 6)
        z = torch.ones(1, 6)
        # Run the memory planning pass and get the graph module
        graph_module = _export_graph_module(
            SliceTensor(),
            (x, y, z),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
            alloc_graph_input=False,
        )
        # Assert that t2 is not optimized away
        self.assertEqual(
            count_node(graph_module, torch.ops.aten.slice_copy.Tensor_out), 1
        )
        # Assert that t1 and t3 are optimized to slice_copy_nop version
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 2
        )

        # When we compile with alloc_graph_input=True, all the slice ops must
        # be optimized.
        graph_module = _export_graph_module(
            SliceTensor(),
            (x, y, z),
            eliminate_dead_code=True,
            opt_level=3,
            mem_algo=1,
            alloc_graph_input=True,
        )
        self.assertEqual(
            count_node(graph_module, torch.ops.aten.slice_copy.Tensor_out), 0
        )
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 3
        )

    def test_optimize_select_Tensor(self) -> None:
        """Select ops become nops, except a select on an unallocated graph input."""

        class SelectTensor(torch.nn.Module):
            def forward(self, x, y, z):
                x1 = torch.add(x, 2.4, 3.1)
                # This select should always be optimized, since x1 is not
                # placeholder, and the select is along the outermost dim
                t1 = torch.select_copy(x1, 0, 1)
                # This select should not be optimized if alloc_graph_input=False,
                # since y is a placeholder node.
                t2 = torch.select_copy(y, 0, 0)
                # This select should always be optimized, since the dims before
                # select dims are 1
                z1 = torch.add(z, 2.4, 3.1)
                t3 = torch.select(z1, 1, 4)
                return (t1 + t2) * t3

        x = torch.ones(3, 6)
        y = torch.ones(2, 6)
        z = torch.ones(1, 6)
        # Optimizing select ops is only at opt_level 2+, and requires the memory
        # planning pass to run:
        graph_module = _export_graph_module(
            SelectTensor(),
            (x, y, z),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
            alloc_graph_input=False,
        )
        # Assert that t2 is not optimized away
        self.assertEqual(
            count_node(graph_module, torch.ops.aten.select_copy.int_out), 1
        )
        # Assert that t1 and t3 are optimized to select_copy_nop version
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._select_copy_nop.int_out), 2
        )

        # When we compile with alloc_graph_input=True, all the select ops must
        # be optimized.
        graph_module = _export_graph_module(
            SelectTensor(),
            (x, y, z),
            eliminate_dead_code=True,
            opt_level=3,
            mem_algo=1,
            alloc_graph_input=True,
        )
        self.assertEqual(
            count_node(graph_module, torch.ops.aten.select_copy.int_out), 0
        )
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._select_copy_nop.int_out), 3
        )

    # TODO: Test fails due to memory planning
    @unittest.expectedFailure
    def test_optimize_cat_with_param(self) -> None:
        """Cat that includes a registered buffer (currently an expected failure)."""

        class CatWithPadding(torch.nn.Module):
            def __init__(self, padding_shape):
                super().__init__()
                zeros = torch.zeros(padding_shape)
                self.register_buffer("padding", zeros)

            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                y1 = torch.add(y, 1, 2)
                # Cat along the outermost dimension cannot be optimized away
                # because padding is a param
                return torch.ops.aten.cat((x1, y1, self.padding))

        x = torch.ones(3, 5)
        y = torch.ones(2, 5)
        graph_module = _export_graph_module(
            CatWithPadding((1, 5)),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
        )
        # Assert that cat op is not optimized away
        self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 1)

    def test_optimize_cat_with_view(self) -> None:
        """Cat of two viewed, separately computed tensors becomes a nop."""

        class CatViewFeasible(torch.nn.Module):
            def forward(self, x, y):
                x1 = torch.add(x, 2.4, 3.1)
                x2 = x1.view((5, 3))
                y1 = torch.add(y, 2.4, 3.1)
                y2 = y1.view((2, 3))
                # Cat can be optimized away since x2 and y2 are not mem-equivalent
                return torch.ops.aten.cat((y2, x2))

        x = torch.ones(3, 5)
        y = torch.ones(3, 2)
        graph_module = _export_graph_module(
            CatViewFeasible(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)

    def test_no_optimize_cat_with_repeated_args(self) -> None:
        """Cat produced by decomposing ``repeat`` is kept."""

        class CatViewInfeasible(torch.nn.Module):
            def forward(self, x):
                x1 = torch.add(x, 2.4, 3.1)
                # Repeat will be decomposed into a cat. The cat cannot be optimized
                # away since all its args are mem-equivalent
                return torch.ops.aten.repeat(x1, [1, 2])

        x = torch.ones(3, 5)
        graph_module = _export_graph_module(
            CatViewInfeasible(),
            (x,),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
        )
        # Assert that cat op is not optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1)
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 0)

    def test_no_optimize_cat_with_placeholder(self) -> None:
        """Cat applied directly to graph inputs is kept."""

        class CatViewInfeasible(torch.nn.Module):
            def forward(self, x, y):
                # Both cat args are graph inputs (placeholders), and the module
                # is compiled with alloc_graph_input=False below; the cat is
                # expected to be kept.
                return torch.cat((x, y), dim=0)

        x = torch.ones(3, 5)
        y = torch.ones(2, 5)
        graph_module = _export_graph_module(
            CatViewInfeasible(),
            (x, y),
            eliminate_dead_code=True,
            opt_level=2,
            mem_algo=1,
            alloc_graph_input=False,
        )
        # Assert that cat op is not optimized away
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1)
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 0)

    def test_no_optimize_cat(self) -> None:
        """With allocated graph inputs, slices and views are optimized, cat is not."""

        class Model(torch.nn.Module):
            def forward(self, x) -> torch.Tensor:
                x0 = torch.slice_copy(x, dim=0, start=0, end=4)
                x0 = x0.view(-1)
                x1 = torch.slice_copy(x, dim=0, start=4, end=8)
                x1 = x1.view(-1)
                return torch.cat((x0, x1), dim=0)

        model = Model()
        inputs = (torch.randn(16, 16),)

        # Check that both view ops and slice copy are optimized.
        # We can't optimize cat op in this case.
        graph_module = _export_graph_module(
            model, inputs, opt_level=3, alloc_graph_input=True
        )
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 0)
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 2
        )
        self.assertEqual(count_node(graph_module, memory.view), 2)

    def test_optimize_slice_copy(self) -> None:
        """With unallocated graph inputs, views and cat are optimized, slices are not."""

        class Model(torch.nn.Module):
            def forward(self, x) -> torch.Tensor:
                x0 = torch.slice_copy(x, dim=0, start=0, end=4)
                x0 = x0.view(-1)
                x1 = torch.slice_copy(x, dim=0, start=4, end=8)
                x1 = x1.view(-1)
                return torch.cat((x0, x1), dim=0)

        model = Model()
        inputs = (torch.randn(16, 16),)

        # Check that view ops and cat are optimized.
        # We can't optimize slice_copy op in this case.
        graph_module = _export_graph_module(
            model, inputs, opt_level=3, alloc_graph_input=False
        )
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)
        self.assertEqual(
            count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 0
        )
        self.assertEqual(count_node(graph_module, memory.view), 2)

    def test_cat_then_cat(self) -> None:
        """Two nested cats of computed tensors are both turned into nops."""

        class Model(torch.nn.Module):
            def forward(self, x) -> torch.Tensor:
                x1 = x + 1
                x2 = x1 + 1
                x3 = x2 + 1
                return torch.cat((torch.cat((x1, x2), dim=0), x3), dim=0)

        model = Model()
        inputs = (torch.randn(16, 16),)

        # Check that both the cat ops can be optimized.
        graph_module = _export_graph_module(
            model, inputs, opt_level=3, alloc_graph_input=False
        )
        self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 2)
        self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)

    def test_view_for_unallocated_output(self) -> None:
        """Memory planning succeeds when a viewed tensor is an unallocated output."""

        class Model(torch.nn.Module):
            def forward(self, x, y):
                x = x + 1
                # x_view will be a memory.view.
                x_view = torch.ops.aten.view_copy(x, [15])
                return x, x_view + y

        x = torch.ones(3, 5)
        y = torch.ones(15)
        # Check that memory planning passes for unallocated output `x`.
        graph_module = _export_graph_module(
            Model(), (x, y), opt_level=2, alloc_graph_output=False
        )
        self.assertEqual(count_node(graph_module, memory.view), 1)