DRAFT CFe fuse Mul Add to FullyConnected
ongoing draft to fuse Mul and Add into FullyConnected.

Signed-off-by: SaeHie Park <[email protected]>
seanshpark committed Jul 16, 2024
1 parent ab68724 commit d57d842
Showing 6 changed files with 42 additions and 3 deletions.
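
This commit only wires the two new passes into the CLI flags, docs, and test lists; the pass implementations appear to live elsewhere in luci (the Algorithms enum entries are already referenced). As a reading aid, here is a minimal NumPy sketch of the algebra the pass names imply, assuming the convention FullyConnected(x) = x @ W.T + b; all names and shapes below are illustrative, not the compiler's API:

import numpy as np

# Assumed convention: FullyConnected(x) = x @ W.T + b,
# with weights W: [out, in] and bias b: [out].
rng = np.random.default_rng(0)
x = rng.standard_normal((2, 8)).astype(np.float32)  # FC input
W = rng.standard_normal((4, 8)).astype(np.float32)  # FC weights
b = rng.standard_normal(4).astype(np.float32)       # FC bias
s = rng.standard_normal(8).astype(np.float32)       # constant Mul operand
a = rng.standard_normal(8).astype(np.float32)       # constant Add operand

def fc(x, W, b):
    return x @ W.T + b

# fuse_mul_to_fullyconnected_weights: a constant Mul feeding the FC input
# folds into the weights, scaling each input column of W by s.
assert np.allclose(fc(x * s, W, b), fc(x, W * s, b), atol=1e-5)

# fuse_add_to_fullyconnected_bias: a constant Add feeding the FC input
# folds into the bias through the weights: b' = b + W @ a.
assert np.allclose(fc(x + a, W, b), fc(x, W, b + W @ a), atol=1e-5)
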
6 changes: 6 additions & 0 deletions compiler/circle2circle-dredd-recipe-test/test.lst
@@ -21,6 +21,9 @@ Add(MaxPoolWithArgmax_000 PASS resolve_customop_max_pool_with_argmax)
Add(MaxPoolWithArgmax_001 PASS resolve_customop_max_pool_with_argmax)
Add(MaxPoolWithArgmax_002 PASS resolve_customop_max_pool_with_argmax)
Add(Net_Add_FloorMod_Gather_000 PASS remove_gather_guard)
Add(Net_Add_FullyConnected_000 PASS fuse_add_to_fullyconnected_bias)
Add(Net_Add_FullyConnected_001 PASS fuse_add_to_fullyconnected_bias)
Add(Net_Add_FullyConnected_002 PASS fuse_add_to_fullyconnected_bias)
Add(Net_BroadcastTo_AddV2_000 PASS resolve_customop_add)
Add(Net_BroadcastTo_AddV2_001 PASS resolve_customop_add)
Add(Net_BroadcastTo_AddV2_002 PASS resolve_customop_add)
@@ -61,6 +64,9 @@ Add(Net_Mul_Add_002 PASS remove_unnecessary_add)
Add(Net_Mul_Add_003 PASS remove_unnecessary_add)
Add(Net_Mul_Div_000 PASS fuse_mul_with_div)
Add(Net_Mul_Div_001 PASS fuse_mul_with_div)
Add(Net_Mul_FullyConnected_000 PASS fuse_mul_to_fullyconnected_weights fold_mul)
Add(Net_Mul_FullyConnected_001 PASS fuse_mul_to_fullyconnected_weights fold_mul)
Add(Net_Mul_FullyConnected_002 PASS fuse_mul_to_fullyconnected_weights fold_mul)
Add(Net_Preactivation_BN_000 PASS fuse_preactivation_batchnorm)
Add(Net_Reshape_Reshape_000 PASS remove_redundant_reshape)
Add(Net_Shape_Add_000 PASS fold_shape)
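
These dredd recipes chain fuse_mul_to_fullyconnected_weights with fold_mul, which suggests the fuse pass re-expresses the new weights as a constant Mul node that fold_mul then evaluates offline. A sketch of that assumed two-step pipeline (inferred from the recipe pairing, not from the pass sources):

import numpy as np

# Assumed pipeline: the fuse pass leaves the weights as a constant
# Mul(W, s) subgraph; fold_mul evaluates it so the circle file stores
# one precomputed weight tensor.
W = np.ones((4, 8), dtype=np.float32)  # original FC weights
s = np.full(8, 0.5, dtype=np.float32)  # scale from the fused Mul
W_folded = W * s                       # the tensor fold_mul materializes
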
11 changes: 11 additions & 0 deletions compiler/circle2circle/src/Circle2Circle.cpp
@@ -81,6 +81,7 @@ int entry(int argc, char **argv)
add_switch(arser, "--fold_fully_connected",
"This will fold FullyConnected operator with constant inputs");
add_switch(arser, "--fold_gather", "This will fold Gather operator");
add_switch(arser, "--fold_mul", "This will fold Mul operator");
add_switch(arser, "--fold_reshape", "This will fold Reshape operator");
add_switch(arser, "--fold_shape", "This will fold Shape operator");
add_switch(arser, "--fold_sparse_to_dense", "This will fold SparseToDense operator");
@@ -93,6 +94,8 @@ int entry(int argc, char **argv)
"This will fuse Activation function to a preceding operator");
add_switch(arser, "--fuse_horizontal_fc_layers",
"This will fuse horizontal FullyConnected layers");
add_switch(arser, "--fuse_add_to_fullyconnected_bias",
"This will fuse Add to following FullyConnected bias");
add_switch(arser, "--fuse_add_with_conv", "This will fuse Add operator to Convolution operator");
add_switch(arser, "--fuse_add_with_fully_connected",
"This will fuse Add operator to FullyConnected operator");
@@ -109,6 +112,8 @@ int entry(int argc, char **argv)
add_switch(arser, "--fuse_mean_with_mean",
"This will fuse two Mean operations when they follow one by one. This will fold them "
"into one operation and merge reduction indices.");
add_switch(arser, "--fuse_mul_to_fullyconnected_weights",
"This will fuse Mul to following FullyConnected weights");
add_switch(arser, "--fuse_mul_with_conv",
"This will fuse Mul operation with a preceding Conv if possible.");
add_switch(arser, "--fuse_mul_with_div",
@@ -275,6 +280,8 @@ int entry(int argc, char **argv)
    options->enable(Algorithms::FoldFullyConnected);
  if (arser.get<bool>("--fold_gather"))
    options->enable(Algorithms::FoldGather);
  if (arser.get<bool>("--fold_mul"))
    options->enable(Algorithms::FoldMul);
  if (arser.get<bool>("--fold_reshape"))
    options->enable(Algorithms::FoldReshape);
  if (arser.get<bool>("--fold_shape"))
@@ -293,6 +300,8 @@ int entry(int argc, char **argv)
    options->enable(Algorithms::FuseHorizontalFullyConnected);
  if (arser.get<bool>("--fuse_batchnorm_with_conv"))
    options->enable(Algorithms::FuseBatchNormWithConv);
  if (arser.get<bool>("--fuse_add_to_fullyconnected_bias"))
    options->enable(Algorithms::FuseAddToFullyConnectedBias);
  if (arser.get<bool>("--fuse_add_with_conv"))
    options->enable(Algorithms::FuseAddWithConv);
  if (arser.get<bool>("--fuse_add_with_fully_connected"))
@@ -303,6 +312,8 @@ int entry(int argc, char **argv)
    options->enable(Algorithms::FuseBatchNormWithDwConv);
  if (arser.get<bool>("--fuse_batchnorm_with_tconv"))
    options->enable(Algorithms::FuseBatchNormWithTConv);
  if (arser.get<bool>("--fuse_mul_to_fullyconnected_weights"))
    options->enable(Algorithms::FuseMulToFullyConnectedWeights);
  if (arser.get<bool>("--fuse_slice_with_tconv"))
    options->enable(Algorithms::FuseSliceWithTConv);
  if (arser.get<bool>("--fuse_bcq"))
8 changes: 7 additions & 1 deletion compiler/luci-pass-value-py-test/test.lst
@@ -2,14 +2,17 @@
# Format:
# eval(MODEL PASS)
# MODEL: tflite model file name in build/compiler/common-artifacts folder.
# PASS: Optimization Pass to test. Supports only one Pass for now.
# PASS: Optimization Pass to test. Supports one or more Passes.
#

# eval(Net_Preactivation_BN_000 fuse_preactivation_batchnorm) : value diff exist
# --> https://github.com/Samsung/ONE/issues/5782
eval(FullyConnected_007 replace_non_const_fc_with_batch_matmul)
eval(HardSwish_001 decompose_hardswish)
eval(Net_Add_FloorMod_Gather_000 remove_gather_guard)
eval(Net_Add_FullyConnected_000 fuse_add_to_fullyconnected_bias)
eval(Net_Add_FullyConnected_001 fuse_add_to_fullyconnected_bias)
eval(Net_Add_FullyConnected_002 fuse_add_to_fullyconnected_bias)
eval(Net_Conv_Add_000 fuse_add_with_conv)
eval(Net_Conv_Add_001 fuse_add_with_conv)
# eval(Net_Conv_Add_002 fuse_add_with_conv) --> Conv2D w/o bias fails in tflite interpreter
@@ -40,6 +43,9 @@ eval(Net_Mul_Add_002 remove_unnecessary_add)
eval(Net_Mul_Add_003 remove_unnecessary_add)
eval(Net_Mul_Div_000 fuse_mul_with_div)
eval(Net_Mul_Div_001 fuse_mul_with_div)
eval(Net_Mul_FullyConnected_000 fuse_mul_to_fullyconnected_weights)
eval(Net_Mul_FullyConnected_001 fuse_mul_to_fullyconnected_weights)
eval(Net_Mul_FullyConnected_002 fuse_mul_to_fullyconnected_weights)
eval(Net_Reshape_Mean_000 forward_reshape_to_unaryop)
eval(Net_Reshape_Neg_000 forward_reshape_to_unaryop)
eval(Net_Reshape_Reshape_000 remove_redundant_reshape)
9 changes: 7 additions & 2 deletions compiler/luci-pass-value-py-test/test_luci_eval.py
@@ -95,8 +95,13 @@ def luci_eval_verify(test_name,
        assert np.allclose(
            luci_output_data, intp_output_data, rtol=rtolint, atol=atolint), err_msg
    elif output_details["dtype"] == np.float32:
        assert np.allclose(
            luci_output_data, intp_output_data, rtol=rtolf32, atol=atolf32), err_msg
        diff_comp = np.allclose(
            luci_output_data, intp_output_data, rtol=rtolf32, atol=atolf32)
        if not diff_comp:
            # dump both outputs and their element-wise difference on mismatch
            print("\r\ntflite:\r\n", intp_output_data, flush=True)
            print("\r\ncircle:\r\n", luci_output_data, flush=True)
            print("\r\nDiff:\r\n", intp_output_data - luci_output_data, flush=True)
        assert diff_comp, err_msg
    elif output_details["dtype"] == np.int64:
        assert np.allclose(
            luci_output_data, intp_output_data, rtol=rtolint, atol=atolint), err_msg
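
The rewritten float32 branch keeps the same tolerance check and only adds diagnostics on failure. For reference, np.allclose(a, b, rtol, atol) passes when |a - b| <= atol + rtol * |b| holds elementwise; a standalone sketch of the pattern (values made up for illustration):

import numpy as np

# tflite vs. circle outputs that differ by more than the tolerance
tflite_out = np.array([1.0, 2.0, 3.0], dtype=np.float32)
circle_out = np.array([1.0, 2.0, 3.1], dtype=np.float32)

if not np.allclose(circle_out, tflite_out, rtol=1e-5, atol=1e-5):
    # printing both tensors and their difference shows whether a mismatch
    # is a real bug or merely a tolerance issue
    print("tflite:", tflite_out)
    print("circle:", circle_out)
    print("diff:", tflite_out - circle_out)
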
3 changes: 3 additions & 0 deletions compiler/one-cmds/how-to-use-one-commands.txt
@@ -160,15 +160,18 @@ Current transformation options are
- fold_dequantize : This removes Dequantize operation which can be folded
- fold_dwconv : This folds Depthwise Convolution operation which can be folded
- fold_gather : This removes Gather operation which can be folded
- fold_mul : This removes Mul operation which can be folded
- fold_shape : This removes Shape operation which can be folded
- fold_sparse_to_dense : This removes SparseToDense operation which can be folded
- forward_reshape_to_unaryop: This will move Reshape after UnaryOp for certain conditions
- fuse_add_to_fullyconnected_bias: This fuses Add operator into the bias of the following FullyConnected operator
- fuse_add_with_conv: This fuses Add operator with the preceding Convolution operator if possible
- fuse_add_with_fully_connected: This fuses Add operator with the preceding FullyConnected operator if possible
- fuse_add_with_tconv: This fuses Add operator with the preceding TConv operator if possible
- fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
- fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
- fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
- fuse_mul_to_fullyconnected_weights : This fuses Mul operator into the weights of the following FullyConnected operator (the algebra for both new fuse options is spelled out after this list)
- fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
- fuse_mul_with_div: This fuses Mul and Div op as Div.
- fuse_slice_with_tconv: This fuses Slice with a preceding TConv if possible.
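
Concretely, assuming the convention FullyConnected(x) = x @ W.T + b: fusing a preceding constant Mul(x, s) scales each input column of the weights (W' = W * s), while fusing a preceding constant Add(x, a) folds the offset into the bias through the weights (b' = b + W @ a); see the NumPy sketch near the top of this commit.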
8 changes: 8 additions & 0 deletions compiler/one-cmds/onelib/constant.py
@@ -29,19 +29,22 @@ class CONSTANT:
        'fold_dwconv',
        'fold_fully_connected',
        'fold_gather',
        'fold_mul',
        'fold_reshape',
        'fold_shape',
        'fold_sparse_to_dense',
        'fold_squeeze',

        # Operator fusion
        'fuse_add_to_fullyconnected_bias',
        'fuse_add_with_conv',
        'fuse_add_with_tconv',
        'fuse_add_with_fully_connected',
        'fuse_batchnorm_with_conv',
        'fuse_batchnorm_with_dwconv',
        'fuse_batchnorm_with_tconv',
        'fuse_activation_function',
        'fuse_mul_to_fullyconnected_weights',
        'fuse_instnorm',
        'fuse_prelu',
        'fuse_gelu',
@@ -104,18 +107,23 @@ class CONSTANT:
        ('fold_dwconv', 'fold Depthwise Convolution op with constant inputs'),
        ('fold_fully_connected', 'fold FullyConnected op with constant inputs'),
        ('fold_gather', 'fold Gather op'),
        ('fold_mul', 'fold Mul op'),
        ('fold_reshape', 'fold Reshape op'),
        ('fold_shape', 'fold Shape op'),
        ('fold_sparse_to_dense', 'fold SparseToDense op'),
        ('fold_squeeze', 'fold Squeeze op'),
        ('forward_reshape_to_unaryop', 'Forward Reshape op'),
        ('forward_transpose_op', 'Forward Transpose op'),
        ('fuse_add_to_fullyconnected_bias',
         'fuse Add op to following FullyConnected op bias'),
        ('fuse_add_with_conv', 'fuse Add op to Convolution op'),
        ('fuse_add_with_tconv', 'fuse Add op to Transposed Convolution op'),
        ('fuse_add_with_fully_connected', 'fuse Add op to FullyConnected op'),
        ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
        ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
        ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
        ('fuse_mul_to_fullyconnected_weights',
         'fuse Mul op to following FullyConnected op weights'),
        ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),
        ('fuse_bcq', 'apply Binary Coded Quantization'),
        ('fuse_preactivation_batchnorm',
